diff --git a/.github/workflows/ci-riscv64.yml b/.github/workflows/ci-riscv64.yml index 2228a4f17df..1029e14986d 100644 --- a/.github/workflows/ci-riscv64.yml +++ b/.github/workflows/ci-riscv64.yml @@ -85,7 +85,9 @@ jobs: for i in *.deb; do dpkg-deb -x $i ../extract; done for i in include lib; do sudo rsync -av ../extract/usr/${i}/riscv64-linux-gnu/ /usr/riscv64-linux-gnu/${i}/; done sudo rsync -av ../extract/usr/include/ /usr/riscv64-linux-gnu/include/ - sudo rsync -av ../extract/lib/riscv64-linux-gnu/ /usr/riscv64-linux-gnu/lib/ + if test -e "../extract/lib/riscv64-linux-gnu/"; then \ + sudo rsync -av ../extract/lib/riscv64-linux-gnu/ /usr/riscv64-linux-gnu/lib/; \ + fi - name: Run Suite working-directory: ${{ github.workspace }} diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp index 8b0452f3d91..c158c00cd84 100644 --- a/clients/drcachesim/analyzer.cpp +++ b/clients/drcachesim/analyzer.cpp @@ -133,6 +133,17 @@ analyzer_t::create_wait_marker() return record; } +template <> +memref_t +analyzer_t::create_idle_marker() +{ + memref_t record = {}; // Zero the other fields. + record.marker.type = TRACE_TYPE_MARKER; + record.marker.marker_type = TRACE_MARKER_TYPE_CORE_IDLE; + record.marker.tid = INVALID_THREAD_ID; + return record; +} + /****************************************************************************** * Specializations for analyzer_tmpl_t, aka record_analyzer_t. */ @@ -182,6 +193,17 @@ record_analyzer_t::create_wait_marker() return record; } +template <> +trace_entry_t +record_analyzer_t::create_idle_marker() +{ + trace_entry_t record; + record.type = TRACE_TYPE_MARKER; + record.size = TRACE_MARKER_TYPE_CORE_IDLE; + record.addr = 0; // Marker value has no meaning so we zero it. + return record; +} + /******************************************************************** * Other analyzer_tmpl_t routines that do not need to be specialized. 
*/ @@ -537,6 +559,12 @@ analyzer_tmpl_t::process_tasks(analyzer_worker_data_t *w // We synthesize a record here. If we wanted this to count toward output // stream ordinals we would need to add a scheduler API to inject it. record = create_wait_marker(); + } else if (status == sched_type_t::STATUS_IDLE) { + assert(shard_type_ == SHARD_BY_CORE); + // We let tools know about idle time so they can analyze cpu usage. + // We synthesize a record here. If we wanted this to count toward output + // stream ordinals we would need to add a scheduler API to inject it. + record = create_idle_marker(); } else if (status != sched_type_t::STATUS_OK) { if (status == sched_type_t::STATUS_REGION_INVALID) { worker->error = @@ -596,8 +624,10 @@ analyzer_tmpl_t::process_tasks(analyzer_worker_data_t *w } } if (shard_type_ == SHARD_BY_CORE) { - if (!process_shard_exit(worker, worker->index)) - return; + if (worker->shard_data.find(worker->index) != worker->shard_data.end()) { + if (!process_shard_exit(worker, worker->index)) + return; + } } for (const auto &keyval : worker->shard_data) { if (!keyval.second.exited) { diff --git a/clients/drcachesim/analyzer.h b/clients/drcachesim/analyzer.h index 35a1df83a56..8ebc10547b2 100644 --- a/clients/drcachesim/analyzer.h +++ b/clients/drcachesim/analyzer.h @@ -252,6 +252,9 @@ template class analyzer_tmpl_t { RecordType create_wait_marker(); + RecordType + create_idle_marker(); + // Invoked when the given interval finishes during serial or parallel // analysis of the trace. For parallel analysis, the shard_id // parameter should be set to the shard_id for which the interval diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 71d7c5ac2f1..53fceb2c03d 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -809,7 +809,7 @@ droption_t op_core_sharded( "software threads. 
This option instead schedules those threads onto virtual cores " "and analyzes each core in parallel. Thus, each shard consists of pieces from " "many software threads. How the scheduling is performed is controlled by a set " - "of options with the prefix \"sched_\" along with -num_cores."); + "of options with the prefix \"sched_\" along with -cores."); droption_t op_core_serial( DROPTION_SCOPE_ALL, "core_serial", false, "Analyze per-core in serial.", @@ -817,7 +817,7 @@ droption_t op_core_serial( "However, the resulting schedule is acted upon by a single analysis thread" "which walks the N cores in lockstep in round robin fashion. " "How the scheduling is performed is controlled by a set " - "of options with the prefix \"sched_\" along with -num_cores."); + "of options with the prefix \"sched_\" along with -cores."); droption_t op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 1 * 1000 * 1000, diff --git a/clients/drcachesim/common/trace_entry.h b/clients/drcachesim/common/trace_entry.h index c885ed1a72a..166c5f32055 100644 --- a/clients/drcachesim/common/trace_entry.h +++ b/clients/drcachesim/common/trace_entry.h @@ -583,6 +583,19 @@ typedef enum { */ TRACE_MARKER_TYPE_CORE_WAIT, + /** + * This marker is used for core-sharded analyses to indicate that the current + * core has no available inputs to run (all inputs are on other cores or are + * blocked waiting for kernel resources). A new marker is emitted each + * time the tool analysis framework requests a new record from the scheduler and + * is given an idle status. There are no units of time here but each repetition + * is roughly the time where a regular record could have been read and passed + * along. This idle marker indicates that a core actually had no work to do, + * as opposed to #TRACE_MARKER_TYPE_CORE_WAIT which is an artifact of an + * imposed re-created schedule. + */ + TRACE_MARKER_TYPE_CORE_IDLE, + // ... // These values are reserved for future built-in marker types. // ... 
diff --git a/clients/drcachesim/reader/reader.h b/clients/drcachesim/reader/reader.h index ad2b64b7f82..98dcae855c2 100644 --- a/clients/drcachesim/reader/reader.h +++ b/clients/drcachesim/reader/reader.h @@ -193,7 +193,8 @@ class reader_t : public std::iterator, is_record_synthetic() const override { if (cur_ref_.marker.type == TRACE_TYPE_MARKER && - cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT) { + (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT || + cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CORE_IDLE)) { // These are synthetic records not part of the input and not // counting toward ordinals. return true; diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 14fda834445..e9494f44a80 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -631,6 +631,8 @@ scheduler_tmpl_t::init( } } } + VPRINT(this, 1, "%zu inputs\n", inputs_.size()); + live_input_count_.store(static_cast(inputs_.size()), std::memory_order_release); return set_initial_schedule(workload2inputs); } @@ -1313,7 +1315,7 @@ scheduler_tmpl_t::advance_region_of_interest( input.cur_region); if (input.cur_region >= static_cast(input.regions_of_interest.size())) { if (input.at_eof) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); else { // We let the user know we're done. if (options_.schedule_record_ostream != nullptr) { @@ -1329,7 +1331,7 @@ scheduler_tmpl_t::advance_region_of_interest( return status; } input.queue.push_back(create_thread_exit(input.tid)); - input.at_eof = true; + mark_input_eof(input); return sched_type_t::STATUS_SKIPPED; } } @@ -1408,7 +1410,7 @@ scheduler_tmpl_t::skip_instructions(output_ordinal_t out if (*input.reader == *input.reader_end) { // Raise error because the input region is out of bounds. 
VPRINT(this, 2, "skip_instructions: input=%d skip out of bounds\n", input.index); - input.at_eof = true; + mark_input_eof(input); return sched_type_t::STATUS_REGION_INVALID; } input.in_cur_region = true; @@ -1645,7 +1647,7 @@ scheduler_tmpl_t::pick_next_input_as_previously( { if (outputs_[output].record_index + 1 >= static_cast(outputs_[output].record.size())) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); const schedule_record_t &segment = outputs_[output].record[outputs_[output].record_index + 1]; index = segment.key.input; @@ -1681,6 +1683,11 @@ scheduler_tmpl_t::pick_next_input_as_previously( // XXX i#5843: We may want to provide a kernel-mediated wait // feature so a multi-threaded simulator doesn't have to do a // spinning poll loop. + // XXX i#5843: For replaying a schedule as it was traced with + // MAP_TO_RECORDED_OUTPUT there may have been true idle periods during + // tracing where some other process than the traced workload was + // scheduled on a core. If we could identify those, we should return + // STATUS_IDLE rather than STATUS_WAIT. VPRINT(this, 3, "next_record[%d]: waiting for input %d instr #%" PRId64 "\n", output, index, segment.start_instruction); // Give up this input and go into a wait state. @@ -1719,7 +1726,7 @@ scheduler_tmpl_t::pick_next_input_as_previously( // queued candidate record, if any. clear_input_queue(inputs_[index]); inputs_[index].queue.push_back(create_thread_exit(inputs_[index].tid)); - inputs_[index].at_eof = true; + mark_input_eof(inputs_[index]); VPRINT(this, 2, "early end for input %d\n", index); // We're done with this entry but we need the queued record to be read, // so we do not move past the entry. @@ -1773,7 +1780,11 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu const schedule_record_t &segment = outputs_[output].record[outputs_[output].record_index]; int input = segment.key.input; - VPRINT(this, res == sched_type_t::STATUS_WAIT ? 
3 : 2, + VPRINT(this, + (res == sched_type_t::STATUS_IDLE || + res == sched_type_t::STATUS_WAIT) + ? 3 + : 2, "next_record[%d]: replay segment in=%d (@%" PRId64 ") type=%d start=%" PRId64 " end=%" PRId64 "\n", output, input, @@ -1819,10 +1830,10 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu // We found a direct switch target above. } else if (ready_queue_empty()) { if (prev_index == INVALID_INPUT_ORDINAL) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); std::lock_guard lock(*inputs_[prev_index].lock); if (inputs_[prev_index].at_eof) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); else index = prev_index; // Go back to prior. } else { @@ -1836,7 +1847,7 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu } input_info_t *queue_next = pop_from_ready_queue(output); if (queue_next == nullptr) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); index = queue_next->index; } } else if (options_.deps == DEPENDENCY_TIMESTAMPS) { @@ -1850,7 +1861,7 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu } } if (index < 0) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); VPRINT(this, 2, "next_record[%d]: advancing to timestamp %" PRIu64 " == input #%d\n", @@ -1883,14 +1894,15 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu std::lock_guard lock(*inputs_[index].lock); if (inputs_[index].at_eof || *inputs_[index].reader == *inputs_[index].reader_end) { - VPRINT(this, 2, "next_record[%d]: local index %d == input #%d at eof\n", - output, outputs_[output].input_indices_index, index); + VPRINT(this, 2, "next_record[%d]: input #%d at eof\n", output, index); if (options_.schedule_record_ostream != nullptr && prev_index != INVALID_INPUT_ORDINAL) close_schedule_segment(output, inputs_[prev_index]); - inputs_[index].at_eof = true; + if (!inputs_[index].at_eof) + mark_input_eof(inputs_[index]); index = INVALID_INPUT_ORDINAL; // Loop and pick next thread. 
+ prev_index = INVALID_INPUT_ORDINAL; continue; } break; @@ -1911,7 +1923,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, // check for quantum end. outputs_[output].cur_time = cur_time; // Invalid values are checked below. if (!outputs_[output].active) - return sched_type_t::STATUS_WAIT; + return sched_type_t::STATUS_IDLE; if (outputs_[output].waiting) { VPRINT(this, 5, "next_record[%d]: need new input (cur=waiting)\n", output); sched_type_t::stream_status_t res = pick_next_input(output, true); @@ -1922,7 +1934,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, if (outputs_[output].cur_input < 0) { // This happens with more outputs than inputs. For non-empty outputs we // require cur_input to be set to >=0 during init(). - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); } input = &inputs_[outputs_[output].cur_input]; auto lock = std::unique_lock(*input->lock); @@ -1970,6 +1982,8 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, input->needs_advance = true; } if (input->at_eof || *input->reader == *input->reader_end) { + if (!input->at_eof) + mark_input_eof(*input); lock.unlock(); VPRINT(this, 5, "next_record[%d]: need new input (cur=%d eof)\n", output, input->index); @@ -1998,6 +2012,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, if (outputs_[output].record_index >= static_cast(outputs_[output].record.size())) { // We're on the last record. 
+ VPRINT(this, 4, "next_record[%d]: on last record\n", output); } else if (outputs_[output].record[outputs_[output].record_index].type == schedule_record_t::SKIP) { VPRINT(this, 5, "next_record[%d]: need new input after skip\n", output); @@ -2257,6 +2272,28 @@ scheduler_tmpl_t::stop_speculation(output_ordinal_t outp return sched_type_t::STATUS_OK; } +template +void +scheduler_tmpl_t::mark_input_eof(input_info_t &input) +{ + input.at_eof = true; + assert(live_input_count_.load(std::memory_order_acquire) > 0); + live_input_count_.fetch_add(-1, std::memory_order_release); +} + +template +typename scheduler_tmpl_t::stream_status_t +scheduler_tmpl_t::eof_or_idle(output_ordinal_t output) +{ + if (options_.mapping == MAP_TO_CONSISTENT_OUTPUT || + live_input_count_.load(std::memory_order_acquire) == 0) { + return sched_type_t::STATUS_EOF; + } else { + outputs_[output].waiting = true; + return sched_type_t::STATUS_IDLE; + } +} + template typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::set_output_active(output_ordinal_t output, diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 046c57e3085..bf5d1808921 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -45,6 +45,7 @@ #include #include +#include #include #include #include @@ -109,10 +110,12 @@ template class scheduler_tmpl_t { * For dynamic scheduling with cross-stream dependencies, the scheduler may pause * a stream if it gets ahead of another stream it should have a dependence on. * This value is also used for schedules following the recorded timestamps - * (#DEPENDENCY_TIMESTAMPS) to avoid one stream getting ahead of another. For - * replaying a schedule as it was traced with #MAP_TO_RECORDED_OUTPUT this can - * indicate an idle period on a core where the traced workload was not currently - * scheduled. + * (#DEPENDENCY_TIMESTAMPS) to avoid one stream getting ahead of another. 
+ * #STATUS_WAIT should be treated as artificial, an artifact of enforcing a + * recorded schedule on concurrent differently-timed output streams. + * Simulators are suggested to not advance simulated time for #STATUS_WAIT while + * they should advance time for #STATUS_IDLE as the latter indicates a true + * lack of work. */ STATUS_WAIT, STATUS_INVALID, /**< Error condition. */ @@ -120,6 +123,15 @@ template class scheduler_tmpl_t { STATUS_NOT_IMPLEMENTED, /**< Feature not implemented. */ STATUS_SKIPPED, /**< Used for internal scheduler purposes. */ STATUS_RECORD_FAILED, /**< Failed to record schedule for future replay. */ + /** + * This code indicates that all inputs are blocked waiting for kernel resources + * (such as i/o). This is similar to #STATUS_WAIT, but #STATUS_WAIT indicates an + * artificial pause due to imposing the original ordering while #STATUS_IDLE + * indicates actual idle time in the application. Simulators are suggested + * to not advance simulated time for #STATUS_WAIT while they should advance + * time for #STATUS_IDLE. + */ + STATUS_IDLE, }; /** Identifies an input stream by its index. */ @@ -629,7 +641,7 @@ template class scheduler_tmpl_t { /** * Disables or re-enables this output stream. If "active" is false, this * stream becomes inactive and its currently assigned input is moved to the - * ready queue to be scheduled on other outputs. The #STATUS_WAIT code is + * ready queue to be scheduled on other outputs. The #STATUS_IDLE code is * returned to next_record() for inactive streams. If "active" is true, * this stream becomes active again. * This is only supported for #MAP_TO_ANY_OUTPUT. @@ -1076,7 +1088,7 @@ template class scheduler_tmpl_t { // sched_lock_. std::vector record; int record_index = 0; - bool waiting = false; + bool waiting = false; // Waiting or idling. bool active = true; // Used for time-based quanta. 
uint64_t cur_time = 0; @@ -1259,6 +1271,13 @@ template class scheduler_tmpl_t { stream_status_t set_output_active(output_ordinal_t output, bool active); + // Caller must hold the input's lock. + void + mark_input_eof(input_info_t &input); + + stream_status_t + eof_or_idle(output_ordinal_t output); + /////////////////////////////////////////////////////////////////////////// // Support for ready queues for who to schedule next: @@ -1325,6 +1344,8 @@ template class scheduler_tmpl_t { flexible_queue_t ready_priority_; // Global ready queue counter used to provide FIFO for same-priority inputs. uint64_t ready_counter_ = 0; + // Count of inputs not yet at eof. + std::atomic live_input_count_; // Map from workload,tid pair to input. struct workload_tid_t { workload_tid_t(int wl, memref_tid_t tid) diff --git a/clients/drcachesim/tests/analysis_unit_tests.cpp b/clients/drcachesim/tests/analysis_unit_tests.cpp index a4fa2646801..c580d27efbb 100644 --- a/clients/drcachesim/tests/analysis_unit_tests.cpp +++ b/clients/drcachesim/tests/analysis_unit_tests.cpp @@ -156,6 +156,10 @@ test_queries() parallel_shard_memref(void *shard_data, const memref_t &memref) override { per_shard_t *shard = reinterpret_cast(shard_data); + if (memref.marker.type == TRACE_TYPE_MARKER && + (memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT || + memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_IDLE)) + return true; // These are our testing goals: these queries. // We have one thread for each of our NUM_INPUTS workloads. 
assert(shard->stream->get_output_cpuid() == shard->index); diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex index 9f1ef413e23..8eeada43d63 100644 --- a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex +++ b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex @@ -4,73 +4,83 @@ Total counts: 8 threads 638938 instructions 5 total context switches - 0.0078255 CSPKI \(context switches per 1000 instructions\) + 0\.0078255 CSPKI \(context switches per 1000 instructions\) 127788 instructions per context switch 5 voluntary context switches 0 direct context switches - 100.00% voluntary switches - 0.00% direct switches + 100\.00% voluntary switches + 0\.00% direct switches 161 system calls 2 maybe-blocking system calls 0 direct switch requests 0 waits + *[0-9]* idles + *[0-9\.]*% cpu busy Core #0 counts: . threads *[0-9]* instructions . total context switches - 0.0[0-9.]* CSPKI \(context switches per 1000 instructions\) + 0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\) *[0-9]* instructions per context switch . voluntary context switches 0 direct context switches - 100.00% voluntary switches - 0.00% direct switches + 100\.00% voluntary switches + 0\.00% direct switches *[0-9]* system calls . maybe-blocking system calls 0 direct switch requests 0 waits + *[0-9]* idles + *[0-9\.]*% cpu busy Core #1 counts: . threads *[0-9]* instructions . total context switches - 0.0[0-9.]* CSPKI \(context switches per 1000 instructions\) + 0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\) *[0-9]* instructions per context switch . voluntary context switches 0 direct context switches - 100.00% voluntary switches - 0.00% direct switches + 100\.00% voluntary switches + 0\.00% direct switches *[0-9]* system calls . maybe-blocking system calls 0 direct switch requests 0 waits + *[0-9]* idles + *[0-9\.]*% cpu busy Core #2 counts: . threads *[0-9]* instructions . 
total context switches - 0.0[0-9.]* CSPKI \(context switches per 1000 instructions\) + 0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\) *[0-9]* instructions per context switch . voluntary context switches 0 direct context switches - 100.00% voluntary switches - 0.00% direct switches + 100\.00% voluntary switches + 0\.00% direct switches *[0-9]* system calls . maybe-blocking system calls 0 direct switch requests 0 waits + *[0-9]* idles + *[0-9\.]*% cpu busy Core #3 counts: . threads *[0-9]* instructions . total context switches - 0.0[0-9.]* CSPKI \(context switches per 1000 instructions\) + 0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\) *[0-9]* instructions per context switch . voluntary context switches 0 direct context switches - 100.00% voluntary switches - 0.00% direct switches + 100\.00% voluntary switches + 0\.00% direct switches *[0-9]* system calls . maybe-blocking system calls 0 direct switch requests 0 waits -Core #0 schedule: [A-H,]* -Core #1 schedule: [A-H,]* -Core #2 schedule: [A-H,]* -Core #3 schedule: [A-H,]* + *[0-9]* idles + *[0-9\.]*% cpu busy +Core #0 schedule: [A-Ha-h_]* +Core #1 schedule: [A-Ha-h_]* +Core #2 schedule: [A-Ha-h_]* +Core #3 schedule: [A-Ha-h_]* diff --git a/clients/drcachesim/tests/schedule_stats_test.cpp b/clients/drcachesim/tests/schedule_stats_test.cpp index 897ffe46fbf..48177635375 100644 --- a/clients/drcachesim/tests/schedule_stats_test.cpp +++ b/clients/drcachesim/tests/schedule_stats_test.cpp @@ -54,6 +54,7 @@ namespace drmemtrace { using ::dynamorio::drmemtrace::default_memtrace_stream_t; using ::dynamorio::drmemtrace::memref_t; using ::dynamorio::drmemtrace::memref_tid_t; +using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_CORE_IDLE; using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_CORE_WAIT; using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH; using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL; @@ -198,10 +199,65 @@ test_basic_stats() return true; } +static 
bool +test_idle() +{ + static constexpr int64_t TID_A = 42; + static constexpr int64_t TID_B = 142; + static constexpr int64_t TID_C = 242; + std::unordered_map tid2ord; + tid2ord[TID_A] = 0; + tid2ord[TID_B] = 1; + tid2ord[TID_C] = 2; + std::vector> memrefs = { + { + gen_instr(TID_B), + gen_instr(TID_B), + gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_instr(TID_B), + gen_instr(TID_B), + gen_instr(TID_B), + }, + { + gen_instr(TID_C), + // Involuntary switch. + gen_instr(TID_A), + // Involuntary switch. + gen_instr(TID_C), + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_instr(TID_C), + gen_instr(TID_C), + // Wait. + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_WAIT, 0), + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_WAIT, 0), + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_WAIT, 0), + // Involuntary switch. 
+ gen_instr(TID_A), + gen_instr(TID_A), + gen_instr(TID_A), + }, + }; + auto result = run_schedule_stats(memrefs, tid2ord); + assert(result.instrs == 13); + assert(result.total_switches == 3); + assert(result.voluntary_switches == 0); + assert(result.direct_switches == 0); + assert(result.syscalls == 0); + assert(result.maybe_blocking_syscalls == 0); + assert(result.direct_switch_requests == 0); + assert(result.waits == 3); + assert(result.idles == 6); + return true; +} + int test_main(int argc, const char *argv[]) { - if (test_basic_stats()) { + if (test_basic_stats() && test_idle()) { std::cerr << "schedule_stats_test passed\n"; return 0; } diff --git a/clients/drcachesim/tests/scheduler_launcher.cpp b/clients/drcachesim/tests/scheduler_launcher.cpp index bfebb7e5861..b0cac30df6e 100644 --- a/clients/drcachesim/tests/scheduler_launcher.cpp +++ b/clients/drcachesim/tests/scheduler_launcher.cpp @@ -152,9 +152,18 @@ void simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &scheduler, std::string &thread_sequence) { + // XXX: Could we share some code with the schedule_stats analysis tool? + // Some features are now duplicated in both. + static constexpr char THREAD_LETTER_INITIAL_START = 'A'; + static constexpr char THREAD_LETTER_SUBSEQUENT_START = 'a'; + static constexpr char WAIT_SYMBOL = '-'; + static constexpr char IDLE_SYMBOL = '_'; memref_t record; uint64_t micros = op_sched_time.get_value() ? get_current_microseconds() : 0; uint64_t cur_segment_instrs = 0; + bool prev_was_wait = false, prev_was_idle = false; + // Measure cpu usage by counting each next_record() as one cycle. + uint64_t cycles_total = 0, cycles_busy = 0; // Thread ids can be duplicated, so use the input ordinals to distinguish. 
scheduler_t::input_ordinal_t prev_input = scheduler_t::INVALID_INPUT_ORDINAL; for (scheduler_t::stream_status_t status = stream->next_record(record, micros); @@ -162,13 +171,33 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch status = stream->next_record(record, micros)) { if (op_sched_time.get_value()) micros = get_current_microseconds(); + ++cycles_total; + // Cache and reset here to ensure we reset on early return paths. + bool was_wait = prev_was_wait; + bool was_idle = prev_was_idle; + prev_was_wait = false; + prev_was_idle = false; if (status == scheduler_t::STATUS_WAIT) { - thread_sequence += '-'; + if (!was_wait || cur_segment_instrs == op_print_every.get_value()) + thread_sequence += WAIT_SYMBOL; + ++cur_segment_instrs; + if (cur_segment_instrs == op_print_every.get_value()) + cur_segment_instrs = 0; + prev_was_wait = true; std::this_thread::yield(); continue; - } - if (status != scheduler_t::STATUS_OK) + } else if (status == scheduler_t::STATUS_IDLE) { + if (!was_idle || cur_segment_instrs == op_print_every.get_value()) + thread_sequence += IDLE_SYMBOL; + ++cur_segment_instrs; + if (cur_segment_instrs == op_print_every.get_value()) + cur_segment_instrs = 0; + prev_was_idle = true; + std::this_thread::yield(); + continue; + } else if (status != scheduler_t::STATUS_OK) FATAL_ERROR("scheduler failed to advance: %d", status); + ++cycles_busy; if (op_verbose.get_value() >= 4) { std::ostringstream line; line << "Core #" << std::setw(2) << ordinal << " @" << std::setw(9) @@ -195,9 +224,8 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch scheduler_t::input_ordinal_t input = stream->get_input_stream_ordinal(); if (input != prev_input) { // We convert to letters which only works well for <=26 inputs. 
- if (!thread_sequence.empty()) - thread_sequence += ','; - thread_sequence += 'A' + static_cast(input % 26); + thread_sequence += + THREAD_LETTER_INITIAL_START + static_cast(input % 26); cur_segment_instrs = 0; if (op_verbose.get_value() >= 2) { std::ostringstream line; @@ -228,7 +256,8 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch if (type_is_instr(record.instr.type)) { ++cur_segment_instrs; if (cur_segment_instrs == op_print_every.get_value()) { - thread_sequence += 'A' + static_cast(input % 26); + thread_sequence += + THREAD_LETTER_SUBSEQUENT_START + static_cast(input % 26); cur_segment_instrs = 0; } } @@ -249,6 +278,13 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch } #endif } + float usage = 0; + if (cycles_total > 0) + usage = 100.f * cycles_busy / static_cast(cycles_total); + std::ostringstream line; + line << "Core #" << std::setw(2) << ordinal << " usage: " << std::setw(9) << usage + << "%\n"; + std::cerr << line.str(); } } // namespace diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 117e6f73e98..2e111ccd20c 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -842,6 +842,10 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti sched_as_string[i] += '-'; continue; } + if (status == scheduler_t::STATUS_IDLE) { + sched_as_string[i] += '_'; + continue; + } assert(status == scheduler_t::STATUS_OK); if (type_is_instr(memref.instr.type)) { sched_as_string[i] += @@ -895,7 +899,7 @@ test_synthetic() // core alternates; with an odd number the 2nd core finishes early. // The dots are thread exits. 
static const char *const CORE0_SCHED_STRING = "AAACCCEEEGGGBBBDDDFFFAAA.CCC.EEE.GGG."; - static const char *const CORE1_SCHED_STRING = "BBBDDDFFFAAACCCEEEGGGBBB.DDD.FFF."; + static const char *const CORE1_SCHED_STRING = "BBBDDDFFFAAACCCEEEGGGBBB.DDD.FFF.____"; { // Test instruction quanta. std::vector sched_inputs; @@ -1048,7 +1052,7 @@ test_synthetic_time_quanta() check_next(cpu0, ++time, scheduler_t::STATUS_OK, TID_C, TRACE_TYPE_INSTR); check_next(cpu0, ++time, scheduler_t::STATUS_OK, TID_C, TRACE_TYPE_INSTR); check_next(cpu0, time, scheduler_t::STATUS_OK, TID_C, TRACE_TYPE_THREAD_EXIT); - check_next(cpu0, time, scheduler_t::STATUS_EOF); + check_next(cpu0, time, scheduler_t::STATUS_IDLE); check_next(cpu1, time, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_THREAD_EXIT); check_next(cpu1, time, scheduler_t::STATUS_EOF); if (scheduler.write_recorded_schedule() != scheduler_t::STATUS_SUCCESS) @@ -1079,7 +1083,7 @@ test_synthetic_time_quanta() for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } - assert(sched_as_string[0] == "..A..CCC."); + assert(sched_as_string[0] == "..A..CCC._"); assert(sched_as_string[1] == "..BAA.BB."); } #endif @@ -1158,8 +1162,9 @@ test_synthetic_with_timestamps() // workloads we should start with {C,F,I,J} and then move on to {B,E,H} and finish // with {A,D,G}. We should interleave within each group -- except once we reach J // we should completely finish it. - assert(sched_as_string[0] == - ".CC.C.II.IC.CC.F.FF.I.II.FF.F..BB.B.HH.HE.EE.BB.B.HH.H..DD.DA.AA.G.GG.DD.D."); + assert( + sched_as_string[0] == + ".CC.C.II.IC.CC.F.FF.I.II.FF.F..BB.B.HH.HE.EE.BB.B.HH.H..DD.DA.AA.G.GG.DD.D._"); assert(sched_as_string[1] == ".FF.F.JJ.JJ.JJ.JJ.J.CC.C.II.I..EE.EB.BB.H.HH.EE.E..AA.A.GG.GD.DD.AA.A.GG.G."); } @@ -1241,8 +1246,9 @@ test_synthetic_with_priorities() // See the test_synthetic_with_timestamps() test which has our base sequence. 
// We've elevated B, E, and H to higher priorities so they go // first. J remains uninterrupted due to lower timestamps. - assert(sched_as_string[0] == - ".BB.B.HH.HE.EE.BB.B.HH.H..FF.F.JJ.JJ.JJ.JJ.J.CC.C.II.I..DD.DA.AA.G.GG.DD.D."); + assert( + sched_as_string[0] == + ".BB.B.HH.HE.EE.BB.B.HH.H..FF.F.JJ.JJ.JJ.JJ.J.CC.C.II.I..DD.DA.AA.G.GG.DD.D._"); assert(sched_as_string[1] == ".EE.EB.BB.H.HH.EE.E..CC.C.II.IC.CC.F.FF.I.II.FF.F..AA.A.GG.GD.DD.AA.A.GG.G."); } @@ -1308,11 +1314,11 @@ test_synthetic_with_bindings() std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } // We have {A,B,C} on {2,4}, {D,E,F} on {0,1}, and {G,H,I} on {1,2,3}: - assert(sched_as_string[0] == ".DD.D.FF.FD.DD.F.FF.DD.D.FF.F."); - assert(sched_as_string[1] == ".EE.E.HH.HE.EE.I.II.EE.E."); + assert(sched_as_string[0] == ".DD.D.FF.FD.DD.F.FF.DD.D.FF.F._"); + assert(sched_as_string[1] == ".EE.E.HH.HE.EE.I.II.EE.E.______"); assert(sched_as_string[2] == ".AA.A.CC.CG.GG.C.CC.HH.H.CC.C."); - assert(sched_as_string[3] == ".GG.G.II.IH.HH.GG.G.II.I."); - assert(sched_as_string[4] == ".BB.BA.AA.B.BB.AA.A.BB.B."); + assert(sched_as_string[3] == ".GG.G.II.IH.HH.GG.G.II.I._____"); + assert(sched_as_string[4] == ".BB.BA.AA.B.BB.AA.A.BB.B._____"); } static void @@ -1379,11 +1385,11 @@ test_synthetic_with_bindings_weighted() std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } // We have {A,B,C} on {2,4}, {D,E,F} on {0,1}, and {G,H,I} on {1,2,3}: - assert(sched_as_string[0] == ".FF.FF.FF.FF.F..EE.EE.EE.EE.E."); - assert(sched_as_string[1] == ".II.II.II.II.I..DD.DD.DD.DD.D."); - assert(sched_as_string[2] == ".CC.CC.CC.CC.C..AA.AA.AA.AA.A."); + assert(sched_as_string[0] == ".FF.FF.FF.FF.F..EE.EE.EE.EE.E._"); + assert(sched_as_string[1] == ".II.II.II.II.I..DD.DD.DD.DD.D._"); + assert(sched_as_string[2] == ".CC.CC.CC.CC.C..AA.AA.AA.AA.A._"); assert(sched_as_string[3] == ".HH.HH.HH.HH.H..GG.GG.GG.GG.G."); - assert(sched_as_string[4] == ".BB.BB.BB.BB.B."); + 
assert(sched_as_string[4] == ".BB.BB.BB.BB.B._______________"); } static void @@ -1472,7 +1478,7 @@ test_synthetic_with_syscalls_multiple() // blocking syscall. assert(sched_as_string[0] == ".B..HH.H.B.H.HH..B.HH.H..B.E.B...II.I.JJ.JJ.JJ.JJ.J.CC.C.II.I..DD.DA.AA.G.GG." - "DD.D."); + "DD.D.___"); assert(sched_as_string[1] == ".EE..B..EE..B..EE..B..EE...CC.C.FF.FB..C.CC.F.FF.I.II.FF.F..AA.A.GG.GD.DD.AA." "A.GG.G."); @@ -1538,7 +1544,7 @@ test_synthetic_with_syscalls_single() std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } assert(sched_as_string[0] == ".AA..AA.A.A.AA..A."); - assert(sched_as_string[1] == ""); + assert(sched_as_string[1] == "__________________"); } static bool @@ -1778,7 +1784,7 @@ simulate_core(scheduler_t::stream_t *stream) memref_t record; for (scheduler_t::stream_status_t status = stream->next_record(record); status != scheduler_t::STATUS_EOF; status = stream->next_record(record)) { - if (status == scheduler_t::STATUS_WAIT) { + if (status == scheduler_t::STATUS_WAIT || status == scheduler_t::STATUS_IDLE) { std::this_thread::yield(); continue; } @@ -1998,7 +2004,7 @@ test_replay() // We expect 3 letter sequences (our quantum) alternating every-other as each // core alternates; with an odd number the 2nd core finishes early. 
static const char *const CORE0_SCHED_STRING = "AAACCCEEEGGGBBBDDDFFFAAA.CCC.EEE.GGG."; - static const char *const CORE1_SCHED_STRING = "BBBDDDFFFAAACCCEEEGGGBBB.DDD.FFF."; + static const char *const CORE1_SCHED_STRING = "BBBDDDFFFAAACCCEEEGGGBBB.DDD.FFF.____"; static constexpr memref_tid_t TID_BASE = 100; std::vector inputs[NUM_INPUTS]; @@ -2090,7 +2096,7 @@ simulate_core_and_record_schedule(scheduler_t::stream_t *stream, memtrace_stream_t *prev_stream = nullptr; for (scheduler_t::stream_status_t status = stream->next_record(record); status != scheduler_t::STATUS_EOF; status = stream->next_record(record)) { - if (status == scheduler_t::STATUS_WAIT) { + if (status == scheduler_t::STATUS_WAIT || status == scheduler_t::STATUS_IDLE) { std::this_thread::yield(); continue; } @@ -2287,7 +2293,7 @@ test_replay_timestamps() // Create a record file with timestamps requiring waiting. // We cooperate with the test_scheduler_t class which constructs this schedule: - static const char *const CORE0_SCHED_STRING = ".AAA-------------------------CCC."; + static const char *const CORE0_SCHED_STRING = ".AAA-------------------------CCC.____"; static const char *const CORE1_SCHED_STRING = ".BBB.CCCCCC.DDDAAABBBDDDAAA.BBB.DDD."; std::string record_fname = "tmp_test_replay_timestamp.zip"; test_scheduler_t test_scheduler; @@ -2515,7 +2521,8 @@ test_replay_limit() memref_t memref; for (scheduler_t::stream_status_t status = stream->next_record(memref); status != scheduler_t::STATUS_EOF; status = stream->next_record(memref)) { - if (status == scheduler_t::STATUS_WAIT) { + if (status == scheduler_t::STATUS_WAIT || + status == scheduler_t::STATUS_IDLE) { std::this_thread::yield(); continue; } @@ -2696,7 +2703,7 @@ test_replay_as_traced() // Synthesize a cpu-schedule file. 
std::string cpu_fname = "tmp_test_cpu_as_traced.zip"; - static const char *const CORE0_SCHED_STRING = "EEE-AAA-CCCAAACCCBBB.DDD."; + static const char *const CORE0_SCHED_STRING = "EEE-AAA-CCCAAACCCBBB.DDD.___"; static const char *const CORE1_SCHED_STRING = "---EEE.BBBDDDBBBDDDAAA.CCC."; { std::vector sched0; @@ -3018,11 +3025,11 @@ test_inactive() // Make cpu1 inactive. status = stream1->set_active(false); assert(status == scheduler_t::STATUS_OK); - check_next(stream1, scheduler_t::STATUS_WAIT); + check_next(stream1, scheduler_t::STATUS_IDLE); // Test making cpu1 inactive while it's already inactive. status = stream1->set_active(false); assert(status == scheduler_t::STATUS_OK); - check_next(stream1, scheduler_t::STATUS_WAIT); + check_next(stream1, scheduler_t::STATUS_IDLE); // Advance cpu0 to its quantum end. check_next(stream0, scheduler_t::STATUS_OK, TID_A, TRACE_TYPE_INSTR); // Ensure cpu0 now picks up the input that was on cpu1. @@ -3034,7 +3041,7 @@ test_inactive() // Make cpu0 inactive and cpu1 active. status = stream0->set_active(false); assert(status == scheduler_t::STATUS_OK); - check_next(stream0, scheduler_t::STATUS_WAIT); + check_next(stream0, scheduler_t::STATUS_IDLE); status = stream1->set_active(true); assert(status == scheduler_t::STATUS_OK); // Now cpu1 should finish things. 
@@ -3070,7 +3077,7 @@ test_inactive() for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } - assert(sched_as_string[0] == "..AABBA."); + assert(sched_as_string[0] == "..AABBA._"); assert(sched_as_string[1] == "..B---B."); } #endif // HAS_ZIP diff --git a/clients/drcachesim/tools/basic_counts.cpp b/clients/drcachesim/tools/basic_counts.cpp index 620d28a02d8..0fbf02a47c5 100644 --- a/clients/drcachesim/tools/basic_counts.cpp +++ b/clients/drcachesim/tools/basic_counts.cpp @@ -169,7 +169,8 @@ basic_counts_t::parallel_shard_memref(void *shard_data, const memref_t &memref) } else if (memref.marker.marker_type == TRACE_MARKER_TYPE_KERNEL_EVENT || memref.marker.marker_type == TRACE_MARKER_TYPE_KERNEL_XFER) { ++counters->xfer_markers; - } else if (memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT) { + } else if (memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT || + memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_IDLE) { // This is a synthetic record so do not increment any counts. 
} else { if (memref.marker.marker_type == TRACE_MARKER_TYPE_WINDOW_ID && diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp index ad12644a3b5..67bfe69b0e5 100644 --- a/clients/drcachesim/tools/schedule_stats.cpp +++ b/clients/drcachesim/tools/schedule_stats.cpp @@ -136,9 +136,10 @@ schedule_stats_t::parallel_shard_error(void *shard_data) bool schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref) { - static constexpr char THREAD_LETTER_START = 'A'; - static constexpr char THREAD_SEPARATOR = ','; + static constexpr char THREAD_LETTER_INITIAL_START = 'A'; + static constexpr char THREAD_LETTER_SUBSEQUENT_START = 'a'; static constexpr char WAIT_SYMBOL = '-'; + static constexpr char IDLE_SYMBOL = '_'; per_shard_t *shard = reinterpret_cast(shard_data); if (knob_verbose_ >= 4) { std::ostringstream line; @@ -159,17 +160,38 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref line << "\n"; std::cerr << line.str(); } + // Cache and reset here to ensure we reset on early return paths. 
+ bool was_wait = shard->prev_was_wait; + bool was_idle = shard->prev_was_idle; + shard->prev_was_wait = false; + shard->prev_was_idle = false; if (memref.marker.type == TRACE_TYPE_MARKER && memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT) { ++shard->counters.waits; - if (!shard->prev_was_wait) { - shard->thread_sequence += '-'; + shard->prev_was_wait = true; + if (!was_wait) { + shard->thread_sequence += WAIT_SYMBOL; shard->cur_segment_instrs = 0; - shard->prev_was_wait = true; } else { ++shard->cur_segment_instrs; if (shard->cur_segment_instrs == knob_print_every_) { shard->thread_sequence += WAIT_SYMBOL; + shard->cur_segment_instrs = 0; + } + } + return true; + } else if (memref.marker.type == TRACE_TYPE_MARKER && + memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_IDLE) { + ++shard->counters.idles; + shard->prev_was_idle = true; + if (!was_idle) { + shard->thread_sequence += IDLE_SYMBOL; + shard->cur_segment_instrs = 0; + } else { + ++shard->cur_segment_instrs; + if (shard->cur_segment_instrs == knob_print_every_) { + shard->thread_sequence += IDLE_SYMBOL; + shard->cur_segment_instrs = 0; } } return true; @@ -183,12 +205,9 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref ++shard->counters.voluntary_switches; if (shard->direct_switch_target == memref.marker.tid) ++shard->counters.direct_switches; - // A comma separating each sequence makes it a little easier to - // read, and helps distinguish a switch from two threads with the - // same %26 letter. (We could remove this though to compact it.) 
- shard->thread_sequence += THREAD_SEPARATOR; } - shard->thread_sequence += THREAD_LETTER_START + static_cast(input % 26); + shard->thread_sequence += + THREAD_LETTER_INITIAL_START + static_cast(input % 26); shard->cur_segment_instrs = 0; if (knob_verbose_ >= 2) { std::ostringstream line; @@ -213,7 +232,8 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref ++shard->counters.instrs; ++shard->cur_segment_instrs; if (shard->cur_segment_instrs == knob_print_every_) { - shard->thread_sequence += THREAD_LETTER_START + static_cast(input % 26); + shard->thread_sequence += + THREAD_LETTER_SUBSEQUENT_START + static_cast(input % 26); shard->cur_segment_instrs = 0; } shard->direct_switch_target = INVALID_THREAD_ID; @@ -236,7 +256,6 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref } } else if (memref.exit.type == TRACE_TYPE_THREAD_EXIT) shard->saw_exit = true; - shard->prev_was_wait = false; return true; } @@ -246,11 +265,15 @@ schedule_stats_t::print_counters(const counters_t &counters) std::cerr << std::setw(12) << counters.threads.size() << " threads\n"; std::cerr << std::setw(12) << counters.instrs << " instructions\n"; std::cerr << std::setw(12) << counters.total_switches << " total context switches\n"; - std::cerr << std::setw(12) << std::fixed << std::setprecision(7) - << (1000 * counters.total_switches / static_cast(counters.instrs)) + double cspki = 0.; + if (counters.instrs > 0) + cspki = 1000 * counters.total_switches / static_cast(counters.instrs); + std::cerr << std::setw(12) << std::fixed << std::setprecision(7) << cspki << " CSPKI (context switches per 1000 instructions)\n"; - std::cerr << std::setw(12) << std::fixed << std::setprecision(0) - << (counters.instrs / static_cast(counters.total_switches)) + double ipcs = 0.; + if (counters.total_switches > 0) + ipcs = counters.instrs / static_cast(counters.total_switches); + std::cerr << std::setw(12) << std::fixed << std::setprecision(0) << ipcs << " 
instructions per context switch\n"; std::cerr << std::setw(12) << std::fixed << std::setprecision(7) << counters.voluntary_switches << " voluntary context switches\n"; @@ -273,6 +296,11 @@ schedule_stats_t::print_counters(const counters_t &counters) std::cerr << std::setw(12) << counters.direct_switch_requests << " direct switch requests\n"; std::cerr << std::setw(12) << counters.waits << " waits\n"; + std::cerr << std::setw(12) << counters.idles << " idles\n"; + std::cerr << std::setw(12) << std::setprecision(2) + << 100 * + (counters.instrs / static_cast(counters.instrs + counters.idles)) + << "% cpu busy\n"; } bool diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h index 073d754b6e8..9d15ccc0638 100644 --- a/clients/drcachesim/tools/schedule_stats.h +++ b/clients/drcachesim/tools/schedule_stats.h @@ -88,6 +88,7 @@ class schedule_stats_t : public analysis_tool_t { maybe_blocking_syscalls += rhs.maybe_blocking_syscalls; direct_switch_requests += rhs.direct_switch_requests; waits += rhs.waits; + idles += rhs.idles; for (const memref_tid_t tid : rhs.threads) { threads.insert(tid); } @@ -101,6 +102,7 @@ class schedule_stats_t : public analysis_tool_t { int64_t maybe_blocking_syscalls = 0; int64_t direct_switch_requests = 0; int64_t waits = 0; + int64_t idles = 0; std::unordered_set threads; }; counters_t @@ -121,6 +123,7 @@ class schedule_stats_t : public analysis_tool_t { std::string thread_sequence; uint64_t cur_segment_instrs = 0; bool prev_was_wait = false; + bool prev_was_idle = false; }; void diff --git a/clients/drcachesim/tools/view.cpp b/clients/drcachesim/tools/view.cpp index fcda9cb7682..2c77e04d682 100644 --- a/clients/drcachesim/tools/view.cpp +++ b/clients/drcachesim/tools/view.cpp @@ -432,6 +432,7 @@ view_t::parallel_shard_memref(void *shard_data, const memref_t &memref) case TRACE_MARKER_TYPE_CORE_WAIT: std::cerr << "\n"; break; + case TRACE_MARKER_TYPE_CORE_IDLE: std::cerr << "\n"; break; default: 
std::cerr << "\n"; diff --git a/core/arch/arch.c b/core/arch/arch.c index 0273431ff49..40d614202fe 100644 --- a/core/arch/arch.c +++ b/core/arch/arch.c @@ -3866,6 +3866,14 @@ set_stolen_reg_val(priv_mcontext_t *mc, reg_t newval) { *(reg_t *)(((byte *)mc) + opnd_get_reg_dcontext_offs(dr_reg_stolen)) = newval; } + +# ifdef RISCV64 +void +set_tp_reg_val(priv_mcontext_t *mc, reg_t newval) +{ + *(reg_t *)(((byte *)mc) + opnd_get_reg_dcontext_offs(DR_REG_TP)) = newval; +} +# endif #endif #ifdef PROFILE_RDTSC diff --git a/core/arch/arch.h b/core/arch/arch.h index d09cec7b556..77796b1c12d 100644 --- a/core/arch/arch.h +++ b/core/arch/arch.h @@ -1569,7 +1569,7 @@ translate_x86_to_x64(dcontext_t *dcontext, instrlist_t *ilist, DR_PARAM_INOUT instr_t **instr); #endif -#ifdef AARCHXX +#if defined(AARCHXX) || defined(RISCV64) bool instr_is_ldstex_mangling(dcontext_t *dcontext, instr_t *inst); #endif diff --git a/core/arch/arch_exports.h b/core/arch/arch_exports.h index 3c927cd9907..8eab49dc6ec 100644 --- a/core/arch/arch_exports.h +++ b/core/arch/arch_exports.h @@ -470,6 +470,10 @@ reg_t get_stolen_reg_val(priv_mcontext_t *context); void set_stolen_reg_val(priv_mcontext_t *mc, reg_t newval); +# ifdef RISCV64 +void +set_tp_reg_val(priv_mcontext_t *mc, reg_t newval); +# endif #endif const char * get_branch_type_name(ibl_branch_type_t branch_type); diff --git a/core/arch/riscv64/mangle.c b/core/arch/riscv64/mangle.c index d7446c3c45d..e51a532be5e 100644 --- a/core/arch/riscv64/mangle.c +++ b/core/arch/riscv64/mangle.c @@ -272,8 +272,7 @@ patch_mov_immed_arch(dcontext_t *dcontext, ptr_int_t val, byte *pc, instr_t *fir bool instr_check_xsp_mangling(dcontext_t *dcontext, instr_t *inst, int *xsp_adjust) { - /* FIXME i#3544: Not implemented */ - ASSERT_NOT_IMPLEMENTED(false); + /* Does not apply to RISC-V. 
*/ return false; } @@ -648,6 +647,48 @@ mangle_special_registers(dcontext_t *dcontext, instrlist_t *ilist, instr_t *inst return next_instr; } +/*************************************************************************** + * LR/SC sequence mangling. + */ + +bool +instr_is_ldstex_mangling(dcontext_t *dcontext, instr_t *instr) +{ + /* This should be kept in sync with mangle_exclusive_monitor_op(). */ + if (!instr_is_our_mangling(instr)) + return false; + + opnd_t memop = opnd_create_null(); + if (instr_get_opcode(instr) == OP_sd) + memop = instr_get_src(instr, 0); + else if (instr_get_opcode(instr) == OP_ld) + memop = instr_get_dst(instr, 0); + if (opnd_is_base_disp(memop)) { + ASSERT(opnd_get_index(memop) == DR_REG_NULL && opnd_get_scale(memop) == 0); + uint offs = opnd_get_disp(memop); + if (opnd_get_base(memop) == dr_reg_stolen && offs >= TLS_LRSC_ADDR_SLOT && + offs <= TLS_LRSC_SIZE_SLOT) + return true; + } + + ptr_int_t val; + if (instr_get_opcode(instr) == OP_fence || instr_get_opcode(instr) == OP_bne || + /* Check for sc.w/d+bne+jal pattern. */ + (instr_get_opcode(instr) == OP_jal && instr_get_prev(instr) != NULL && + instr_get_opcode(instr_get_prev(instr)) == OP_bne && + instr_get_prev(instr_get_prev(instr)) != NULL && + instr_is_exclusive_store(instr_get_prev(instr_get_prev(instr)))) || + instr_is_exclusive_load(instr) || instr_is_exclusive_store(instr) || + (instr_is_mov_constant(instr, &val) && + /* XXX: These are fragile, should we look backward a bit to check for more + specific patterns? 
*/ + (val == 1 /* cas fail */ || val == -1 /* reservation invalidation */ || + val == 4 /* lr.w/sc.w size */ || val == 8 /* lr.d/sc.d size */))) + return true; + + return false; +} + static instr_t * mangle_exclusive_load(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, instr_t *next_instr) diff --git a/core/arch/x86_code.c b/core/arch/x86_code.c index e70edeadd18..5de603c59ec 100644 --- a/core/arch/x86_code.c +++ b/core/arch/x86_code.c @@ -306,7 +306,7 @@ new_thread_setup(priv_mcontext_t *mc) ASSERT(rc != -1); /* this better be a new thread */ dcontext = get_thread_private_dcontext(); ASSERT(dcontext != NULL); -# ifdef AARCHXX +# if defined(AARCHXX) || defined(RISCV64) set_app_lib_tls_base_from_clone_record(dcontext, crec); # endif # ifdef ARM diff --git a/core/ir/aarch64/codec.c b/core/ir/aarch64/codec.c index aeb7d7a5cdf..eefcb0e9385 100644 --- a/core/ir/aarch64/codec.c +++ b/core/ir/aarch64/codec.c @@ -7950,6 +7950,99 @@ memory_transfer_size_from_dtype(uint enc) return opnd_size_from_bytes((1 << insz) * elements); } +static inline bool +decode_svemem_vec_sd_gpr16(uint size_bit, uint enc, int opcode, byte *pc, + OUT opnd_t *opnd) +{ + const aarch64_reg_offset msz = BITS(enc, 24, 23); + const uint scale = 1 << msz; + + uint single_bit_value = 0; + + if (size_bit == 22) + single_bit_value = 1; + + const aarch64_reg_offset element_size = + BITS(enc, size_bit, size_bit) == single_bit_value ? 
SINGLE_REG : DOUBLE_REG; + + const opnd_size_t mem_transfer = + opnd_size_from_bytes(scale * get_elements_in_sve_vector(element_size)); + + const reg_id_t zn = decode_vreg(Z_REG, extract_uint(enc, 5, 5)); + ASSERT(reg_is_z(zn)); + + const reg_id_t xm = decode_reg(extract_uint(enc, 16, 5), true, false /* XZR */); + ASSERT(reg_is_gpr(xm)); + + *opnd = opnd_create_vector_base_disp_aarch64( + zn, xm, get_opnd_size_from_offset(element_size), DR_EXTEND_UXTX, false, 0, 0, + mem_transfer, 0); + return true; +} + +static inline bool +encode_svemem_vec_sd_gpr16(uint size_bit, uint enc, int opcode, byte *pc, opnd_t opnd, + OUT uint *enc_out) +{ + + uint single_bit_value = 0; + + if (size_bit == 22) + single_bit_value = 1; + + // Element size is a part of the constant bits + const aarch64_reg_offset element_size = + BITS(enc, size_bit, size_bit) == single_bit_value ? SINGLE_REG : DOUBLE_REG; + + if (!opnd_is_base_disp(opnd) || opnd_get_index(opnd) == DR_REG_NULL || + get_vector_element_reg_offset(opnd) != element_size) + return false; + + bool index_scaled; + uint index_scale_amount; + if (opnd_get_index_extend(opnd, &index_scaled, &index_scale_amount) != + DR_EXTEND_UXTX || + index_scaled || index_scale_amount != 0) + return false; + + uint zreg_number; + opnd_size_t reg_size = OPSZ_SCALABLE; + IF_RETURN_FALSE(!encode_vreg(®_size, &zreg_number, opnd_get_base(opnd))) + + const aarch64_reg_offset msz = BITS(enc, 24, 23); + const uint scale = 1 << msz; + + const opnd_size_t mem_transfer = + opnd_size_from_bytes(scale * get_elements_in_sve_vector(element_size)); + IF_RETURN_FALSE(opnd_get_size(opnd) != mem_transfer) + + uint xreg_number; + bool is_x = false; + IF_RETURN_FALSE(!encode_reg(&xreg_number, &is_x, opnd_get_index(opnd), false) || + !is_x) + + *enc_out |= (xreg_number << 16) | (zreg_number << 5); + return true; +} + +/* + * svemem_vec_sssd_gpr16: SVE memory address with GPR offset [.S/D{, }], + * size determined by bit 22 + */ + +static inline bool 
+decode_opnd_svemem_vec_22sd_gpr16(uint enc, int opcode, byte *pc, OUT opnd_t *opnd) +{ + return decode_svemem_vec_sd_gpr16(22, enc, opcode, pc, opnd); +} + +static inline bool +encode_opnd_svemem_vec_22sd_gpr16(uint enc, int opcode, byte *pc, opnd_t opnd, + OUT uint *enc_out) +{ + return encode_svemem_vec_sd_gpr16(22, enc, opcode, pc, opnd, enc_out); +} + /* SVE memory operand [{, #, MUL VL}] 1 dest register */ static inline bool @@ -8349,66 +8442,16 @@ encode_opnd_x16imm(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_ou /* svemem_vec_sd_gpr16: SVE memory address with GPR offset [.S/D{, }] */ static inline bool -decode_opnd_svemem_vec_sd_gpr16(uint enc, int opcode, byte *pc, OUT opnd_t *opnd) +decode_opnd_svemem_vec_30sd_gpr16(uint enc, int opcode, byte *pc, OUT opnd_t *opnd) { - const aarch64_reg_offset msz = BITS(enc, 24, 23); - const uint scale = 1 << msz; - - const aarch64_reg_offset element_size = - BITS(enc, 30, 30) > 0 ? DOUBLE_REG : SINGLE_REG; - - const opnd_size_t mem_transfer = - opnd_size_from_bytes(scale * get_elements_in_sve_vector(element_size)); - - const reg_id_t zn = decode_vreg(Z_REG, extract_uint(enc, 5, 5)); - ASSERT(reg_is_z(zn)); - - const reg_id_t xm = decode_reg(extract_uint(enc, 16, 5), true, false /* XZR */); - ASSERT(reg_is_gpr(xm)); - - *opnd = opnd_create_vector_base_disp_aarch64( - zn, xm, get_opnd_size_from_offset(element_size), DR_EXTEND_UXTX, false, 0, 0, - mem_transfer, 0); - return true; + return decode_svemem_vec_sd_gpr16(30, enc, opcode, pc, opnd); } static inline bool -encode_opnd_svemem_vec_sd_gpr16(uint enc, int opcode, byte *pc, opnd_t opnd, - OUT uint *enc_out) +encode_opnd_svemem_vec_30sd_gpr16(uint enc, int opcode, byte *pc, opnd_t opnd, + OUT uint *enc_out) { - // Element size is a part of the constant bits - const aarch64_reg_offset element_size = - BITS(enc, 30, 30) > 0 ? 
DOUBLE_REG : SINGLE_REG; - - if (!opnd_is_base_disp(opnd) || opnd_get_index(opnd) == DR_REG_NULL || - get_vector_element_reg_offset(opnd) != element_size) - return false; - - bool index_scaled; - uint index_scale_amount; - if (opnd_get_index_extend(opnd, &index_scaled, &index_scale_amount) != - DR_EXTEND_UXTX || - index_scaled || index_scale_amount != 0) - return false; - - uint zreg_number; - opnd_size_t reg_size = OPSZ_SCALABLE; - IF_RETURN_FALSE(!encode_vreg(®_size, &zreg_number, opnd_get_base(opnd))) - - const aarch64_reg_offset msz = BITS(enc, 24, 23); - const uint scale = 1 << msz; - - const opnd_size_t mem_transfer = - opnd_size_from_bytes(scale * get_elements_in_sve_vector(element_size)); - IF_RETURN_FALSE(opnd_get_size(opnd) != mem_transfer) - - uint xreg_number; - bool is_x = false; - IF_RETURN_FALSE(!encode_reg(&xreg_number, &is_x, opnd_get_index(opnd), false) || - !is_x) - - *enc_out |= (xreg_number << 16) | (zreg_number << 5); - return true; + return encode_svemem_vec_sd_gpr16(30, enc, opcode, pc, opnd, enc_out); } /* index3: index of D subreg in Q register: 0-1 */ diff --git a/core/ir/aarch64/codec_sve2.txt b/core/ir/aarch64/codec_sve2.txt index a92170a944c..767c4a95275 100644 --- a/core/ir/aarch64/codec_sve2.txt +++ b/core/ir/aarch64/codec_sve2.txt @@ -86,11 +86,18 @@ 01100100101xxxxx0110x1xxxxxxxxxx n 1070 SVE2 fmlslt z_s_0 : z_s_0 z_msz_bhsd_5 z3_msz_bhsd_16 i3_index_11 01000101xx1xxxxx110xxxxxxxxxxxxx n 1145 SVE2 histcnt z_size_sd_0 : p10_zer_lo z_size_sd_5 z_size_sd_16 01000101001xxxxx101000xxxxxxxxxx n 1071 SVE2 histseg z_b_0 : z_b_5 z_b_16 -11000100000xxxxx100xxxxxxxxxxxxx n 1186 SVE2 ldnt1sb z_d_0 : svemem_vec_sd_gpr16 p10_zer_lo -10000100000xxxxx100xxxxxxxxxxxxx n 1186 SVE2 ldnt1sb z_s_0 : svemem_vec_sd_gpr16 p10_zer_lo -11000100100xxxxx100xxxxxxxxxxxxx n 1187 SVE2 ldnt1sh z_d_0 : svemem_vec_sd_gpr16 p10_zer_lo -10000100100xxxxx100xxxxxxxxxxxxx n 1187 SVE2 ldnt1sh z_s_0 : svemem_vec_sd_gpr16 p10_zer_lo -11000101000xxxxx100xxxxxxxxxxxxx n 1188 
SVE2 ldnt1sw z_d_0 : svemem_vec_sd_gpr16 p10_zer_lo +11000100000xxxxx110xxxxxxxxxxxxx n 950 SVE2 ldnt1b z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo +10000100000xxxxx101xxxxxxxxxxxxx n 950 SVE2 ldnt1b z_s_0 : svemem_vec_30sd_gpr16 p10_zer_lo +11000101100xxxxx110xxxxxxxxxxxxx n 992 SVE2 ldnt1d z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo +11000100100xxxxx110xxxxxxxxxxxxx n 993 SVE2 ldnt1h z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo +10000100100xxxxx101xxxxxxxxxxxxx n 993 SVE2 ldnt1h z_s_0 : svemem_vec_30sd_gpr16 p10_zer_lo +11000100000xxxxx100xxxxxxxxxxxxx n 1186 SVE2 ldnt1sb z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo +10000100000xxxxx100xxxxxxxxxxxxx n 1186 SVE2 ldnt1sb z_s_0 : svemem_vec_30sd_gpr16 p10_zer_lo +11000100100xxxxx100xxxxxxxxxxxxx n 1187 SVE2 ldnt1sh z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo +10000100100xxxxx100xxxxxxxxxxxxx n 1187 SVE2 ldnt1sh z_s_0 : svemem_vec_30sd_gpr16 p10_zer_lo +11000101000xxxxx100xxxxxxxxxxxxx n 1188 SVE2 ldnt1sw z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo +11000101000xxxxx110xxxxxxxxxxxxx n 994 SVE2 ldnt1w z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo +10000101000xxxxx101xxxxxxxxxxxxx n 994 SVE2 ldnt1w z_s_0 : svemem_vec_30sd_gpr16 p10_zer_lo 01000101xx1xxxxx100xxxxxxxx0xxxx w 1189 SVE2 match p_size_bh_0 : p10_zer_lo z_size_bh_5 z_size_bh_16 00000100111xxxxx001111xxxxxxxxxx n 1072 SVE2 nbsl z_d_0 : z_d_0 z_d_16 z_d_5 01000101xx1xxxxx100xxxxxxxx1xxxx w 1190 SVE2 nmatch p_size_bh_0 : p10_zer_lo z_size_bh_5 z_size_bh_16 @@ -223,6 +230,13 @@ 01000101xx0xxxxx100011xxxxxxxxxx n 1116 SVE2 ssubltb z_size_hsd_0 : z_sizep1_bhs_5 z_sizep1_bhs_16 01000101xx0xxxxx010100xxxxxxxxxx n 1117 SVE2 ssubwb z_size_hsd_0 : z_size_hsd_5 z_sizep1_bhs_16 01000101xx0xxxxx010101xxxxxxxxxx n 1118 SVE2 ssubwt z_size_hsd_0 : z_size_hsd_5 z_sizep1_bhs_16 +11100100000xxxxx001xxxxxxxxxxxxx n 952 SVE2 stnt1b svemem_vec_22sd_gpr16 : z_d_0 p10_lo +11100100010xxxxx001xxxxxxxxxxxxx n 952 SVE2 stnt1b svemem_vec_22sd_gpr16 : z_s_0 p10_lo +11100101100xxxxx001xxxxxxxxxxxxx n 1004 SVE2 
stnt1d svemem_vec_30sd_gpr16 : z_d_0 p10_lo +11100100100xxxxx001xxxxxxxxxxxxx n 1005 SVE2 stnt1h svemem_vec_22sd_gpr16 : z_d_0 p10_lo +11100100110xxxxx001xxxxxxxxxxxxx n 1005 SVE2 stnt1h svemem_vec_22sd_gpr16 : z_s_0 p10_lo +11100101000xxxxx001xxxxxxxxxxxxx n 1006 SVE2 stnt1w svemem_vec_22sd_gpr16 : z_d_0 p10_lo +11100101010xxxxx001xxxxxxxxxxxxx n 1006 SVE2 stnt1w svemem_vec_22sd_gpr16 : z_s_0 p10_lo 01000101xx1xxxxx011100xxxxxxxxxx n 1119 SVE2 subhnb z_sizep1_bhs_0 : z_size_hsd_5 z_size_hsd_16 01000101xx1xxxxx011101xxxxxxxxxx n 1120 SVE2 subhnt z_sizep1_bhs_0 : z_sizep1_bhs_0 z_size_hsd_5 z_size_hsd_16 01000100xx011100100xxxxxxxxxxxxx n 474 SVE2 suqadd z_size_bhsd_0 : p10_mrg_lo z_size_bhsd_0 z_size_bhsd_5 diff --git a/core/ir/aarch64/instr_create_api.h b/core/ir/aarch64/instr_create_api.h index 6dfe149ef52..6a4e2e97168 100644 --- a/core/ir/aarch64/instr_create_api.h +++ b/core/ir/aarch64/instr_create_api.h @@ -11541,6 +11541,8 @@ * \verbatim * LDNT1B { .B }, /Z, [, ] * LDNT1B { .B }, /Z, [{, #, MUL VL}] + * LDNT1B { .D }, /Z, [.D{, }] + * LDNT1B { .S }, /Z, [.S{, }] * \endverbatim * \param dc The void * dcontext used to allocate memory for the #instr_t. * \param Zt The destination vector register, Z (Scalable). 
@@ -11549,9 +11551,12 @@ * constructed with the function: * For the [\, \] variant: * opnd_create_base_disp_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, 0, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / - * 8)) For the [\{, #\, MUL VL}] variant: opnd_create_base_disp(Rn, - * DR_REG_NULL, 0, imm, opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) + * DR_EXTEND_UXTX, 0, 0, 0, opnd_size_from_bytes( + * dr_get_sve_vector_length() / 8)) + * For the [\{, #\, MUL VL}] variant: opnd_create_base_disp(Rn, + * DR_REG_NULL, 0, imm, opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) + * For the vector+scalar variant: opnd_create_base_disp_aarch64(Zn, Rm, + * DR_EXTEND_UXTX, 0, 0, 0, OPSZ_1) */ #define INSTR_CREATE_ldnt1b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnt1b, Zt, Rn, Pg) @@ -11606,6 +11611,8 @@ * \verbatim * STNT1B { .B }, , [, ] * STNT1B { .B }, , [{, #, MUL VL}] + * STNT1B { .D }, , [.D{, }] + * STNT1B { .S }, , [.S{, }] * \endverbatim * \param dc The void * dcontext used to allocate memory for the #instr_t. * \param Zt The first source vector register, Z (Scalable). 
@@ -11614,9 +11621,22 @@ * constructed with the function: * For the [\, \] variant: * opnd_create_base_disp_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, 0, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / - * 8)) For the [\{, #\, MUL VL}] variant: opnd_create_base_disp(Rn, - * DR_REG_NULL, 0, imm, opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(dr_get_sve_vector_length() / + * 8)) + * For the [\{, #\, MUL VL}] variant: + * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) + * For the [\.D{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_8, + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(proc_get_vector_length_bytes() / 8), + * 0) + * For the [\.S{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_4, + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(proc_get_vector_length_bytes() / 4), + * 0) */ #define INSTR_CREATE_stnt1b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_stnt1b, Rn, Zt, Pg) @@ -12809,6 +12829,7 @@ * \verbatim * LDNT1D { .D }, /Z, [, , LSL #3] * LDNT1D { .D }, /Z, [{, #, MUL VL}] + * LDNT1D { .D }, /Z, [.D{, }] * \endverbatim * \param dc The void * dcontext used to allocate memory for the #instr_t. * \param Zt The destination vector register, Z (Scalable). 
@@ -12822,6 +12843,10 @@ * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) + * For the [\.D{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_8, + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(proc_get_vector_length_bytes()), 0) */ #define INSTR_CREATE_ldnt1d_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnt1d, Zt, Rn, Pg) @@ -12833,6 +12858,8 @@ * \verbatim * LDNT1H { .H }, /Z, [, , LSL #1] * LDNT1H { .H }, /Z, [{, #, MUL VL}] + * LDNT1H { .D }, /Z, [.D{, }] + * LDNT1H { .S }, /Z, [.S{, }] * \endverbatim * \param dc The void * dcontext used to allocate memory for the #instr_t. * \param Zt The destination vector register, Z (Scalable). @@ -12846,6 +12873,15 @@ * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) + * For the [\.D{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_8, + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(proc_get_vector_length_bytes() / 4), + * 0) + * For the [\.S{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_4, + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(proc_get_vector_length_bytes() / 2), 0) */ #define INSTR_CREATE_ldnt1h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnt1h, Zt, Rn, Pg) @@ -12857,6 +12893,8 @@ * \verbatim * LDNT1W { .S }, /Z, [, , LSL #2] * LDNT1W { .S }, /Z, [{, #, MUL VL}] + * LDNT1W { .D }, /Z, [.D{, }] + * LDNT1W { .S }, /Z, [.S{, }] * \endverbatim * \param dc The void * dcontext used to allocate memory for the #instr_t. * \param Zt The destination vector register, Z (Scalable). 
@@ -12870,6 +12908,13 @@ * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) + * For the [\.D{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_8, + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(proc_get_vector_length_bytes() / 2), 0) + * For the [\.S{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_4, DR_EXTEND_UXTX, 0, 0, + * 0, opnd_size_from_bytes(proc_get_vector_length_bytes()), 0) */ #define INSTR_CREATE_ldnt1w_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnt1w, Zt, Rn, Pg) @@ -13097,6 +13142,7 @@ * \verbatim * STNT1D { .D }, , [, , LSL #3] * STNT1D { .D }, , [{, #, MUL VL}] + * STNT1D { .D }, , [.D{, }] * \endverbatim * \param dc The void * dcontext used to allocate memory for the #instr_t. * \param Zt The first source vector register, Z (Scalable). @@ -13111,6 +13157,10 @@ * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) + * For the [\.D{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_8, + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(proc_get_vector_length_bytes()), 0) */ #define INSTR_CREATE_stnt1d_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_stnt1d, Rn, Zt, Pg) @@ -13122,6 +13172,8 @@ * \verbatim * STNT1H { .H }, , [, , LSL #1] * STNT1H { .H }, , [{, #, MUL VL}] + * STNT1H { .D }, , [.D{, }] + * STNT1H { .S }, , [.S{, }] * \endverbatim * \param dc The void * dcontext used to allocate memory for the #instr_t. * \param Zt The first source vector register, Z (Scalable). 
@@ -13135,6 +13187,14 @@ * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) + * For the [\.D{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_8, + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(proc_get_vector_length_bytes() / 4), 0) + * For the [\.S{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_4, + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(proc_get_vector_length_bytes() / 2), 0) */ #define INSTR_CREATE_stnt1h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_stnt1h, Rn, Zt, Pg) @@ -13146,6 +13206,8 @@ * \verbatim * STNT1W { .S }, , [, , LSL #2] * STNT1W { .S }, , [{, #, MUL VL}] + * STNT1W { .D }, , [.D{, }] + * STNT1W { .S }, , [.S{, }] * \endverbatim * \param dc The void * dcontext used to allocate memory for the #instr_t. * \param Zt The first source vector register, Z (Scalable). @@ -13159,6 +13221,14 @@ * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) + * For the [\.D{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_8, + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(proc_get_vector_length_bytes() / 2), 0) + * For the [\.S{, \}] variant: + * opnd_create_vector_base_disp_aarch64(Zn, Xm, OPSZ_4, + * DR_EXTEND_UXTX, 0, 0, 0, + * opnd_size_from_bytes(proc_get_vector_length_bytes()), 0) */ #define INSTR_CREATE_stnt1w_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_stnt1w, Rn, Zt, Pg) diff --git a/core/ir/aarch64/opnd_defs.txt b/core/ir/aarch64/opnd_defs.txt index b0f68850ab8..89f355aae08 100644 --- a/core/ir/aarch64/opnd_defs.txt +++ b/core/ir/aarch64/opnd_defs.txt @@ -356,6 +356,7 @@ -------??--xxxxx------xxxxx----- sveprf_gpr_shf # SVE memory address [, , LSL #x] for prefetch operation -------??-?xxxxx------xxxxx----- svemem_gpr_vec64 # SVE memory address (64-bit offset) [, .D{, }] 
-------??-xxxxxxx-----xxxxx----- mem7_tag # Write bytes is fixed at 16bytes, post/pre/offset is in 24:23, with memory tag scaling +-------???-xxxxx------xxxxx----- svemem_vec_22sd_gpr16 # SVE memory operand [.S/D{, }] -------????-xxxx------xxxxx----- svemem_gpr_simm4_vl_1reg # SVE memory operand [{, #, MUL VL}] # 1 src/dest register -------????xxxxx------xxxxx----- svemem_ssz_gpr_shf # SVE memory operand [, , LSL #x] @@ -374,7 +375,7 @@ -------xx--xxxxx---------------- z_msz_bhsd_16 # z register with element size determined by msz -?--------------------xxxxx----- mem0p # gets size from 30; no offset, pair -?---------xxxxx????------------ x16imm # computes immed from 30 and 15:12 --?-----??--xxxxx------xxxxx----- svemem_vec_sd_gpr16 # SVE memory address with GPR offset [.S/D{, }] +-?-----??--xxxxx------xxxxx----- svemem_vec_30sd_gpr16 # SVE memory address with GPR offset [.S/D{, }] -x------------------------------ index3 # index of D subreg in Q: 0-1 -x-------------------------xxxxx wx0_30 # X register if bit 30 is set, else W -x-------------------------xxxxx dq0 # Q register if bit 30 is set, else D diff --git a/core/translate.c b/core/translate.c index d327e0fd751..7a6ccc4d460 100644 --- a/core/translate.c +++ b/core/translate.c @@ -132,6 +132,8 @@ instr_is_inline_syscall_jmp(dcontext_t *dcontext, instr_t *inst) /* A32 uses a regular jump */ instr_get_opcode(inst) == OP_b) && opnd_is_instr(instr_get_target(inst))); +# elif defined(RISCV64) + return (instr_get_opcode(inst) == OP_jal && opnd_is_instr(instr_get_target(inst))); # else ASSERT_NOT_IMPLEMENTED(false); return false; @@ -304,14 +306,14 @@ translate_walk_track_pre_instr(dcontext_t *tdcontext, instr_t *inst, walk->unsupported_mangle = false; walk->xsp_adjust = 0; for (reg_id_t r = 0; r < REG_SPILL_NUM; r++) { -#ifndef AARCHXX +#ifdef X86 /* we should have seen a restore for every spill, unless at * fragment-ending jump to ibl, which shouldn't come here */ ASSERT(walk->reg_spill_offs[r] == UINT_MAX);
walk->reg_spill_offs[r] = UINT_MAX; /* be paranoid */ #else - /* On AArchXX we do spill registers across app instrs and mangle + /* On AArchXX/RISCV64 we do spill registers across app instrs and mangle * regions, though right now only the following routines do this: * - mangle_stolen_reg() * - mangle_gpr_list_read() @@ -399,9 +401,6 @@ translate_walk_track_post_instr(dcontext_t *tdcontext, instr_t *inst, * comment above for post-mangling traces), and so for local * spills like rip-rel and ind branches this is fine. */ -#if defined(RISCV64) - ASSERT_NOT_IMPLEMENTED(false); -#endif if (instr_is_cti(inst) && #ifdef X86 /* Do not reset for a trace-cmp jecxz or jmp (32-bit) or @@ -420,10 +419,7 @@ translate_walk_track_post_instr(dcontext_t *tdcontext, instr_t *inst, (!opnd_is_pc(instr_get_target(inst)) || (opnd_get_pc(instr_get_target(inst)) >= walk->start_cache && opnd_get_pc(instr_get_target(inst)) < walk->end_cache)))) -#elif defined(RISCV64) - /* FIXME i#3544: Not implemented */ - false -#else +#elif defined(AARCHXX) /* Do not reset for cbnz/bne in ldstex mangling, nor for the b after strex. */ !(instr_get_opcode(inst) == OP_cbnz || (instr_get_opcode(inst) == OP_b && @@ -432,6 +428,17 @@ translate_walk_track_post_instr(dcontext_t *tdcontext, instr_t *inst, (instr_get_opcode(inst) == OP_b && (instr_get_prev(inst) != NULL && instr_is_exclusive_store(instr_get_prev(inst))))) +#elif defined(RISCV64) + /* Do not reset for bne in LR/SC mangling, nor for the jal after SC. + * This should be kept in sync with mangle_exclusive_monitor_op(). 
+ */ + !(instr_get_opcode(inst) == OP_bne || + (instr_get_opcode(inst) == OP_jal && instr_get_prev(inst) != NULL && + instr_get_opcode(instr_get_prev(inst)) == OP_bne && + instr_get_prev(instr_get_prev(inst)) != NULL && + instr_is_exclusive_store(instr_get_prev(instr_get_prev(inst))))) +#else +# error Unsupported architecture #endif ) { /* FIXME i#1551: add ARM version of the series of trace cti checks above */ @@ -547,7 +554,7 @@ translate_walk_track_post_instr(dcontext_t *tdcontext, instr_t *inst, /* nothing to do */ } #endif -#ifdef AARCHXX +#if defined(AARCHXX) || defined(RISCV64) else if (instr_is_ldstex_mangling(tdcontext, inst)) { /* nothing to do */ } diff --git a/core/unix/os_public.h b/core/unix/os_public.h index 1647dcbdb4f..79d348e9e74 100644 --- a/core/unix/os_public.h +++ b/core/unix/os_public.h @@ -205,6 +205,7 @@ typedef kernel_sigcontext_t sigcontext_t; # define SC_SYSNUM_REG SC_R7 # define SC_RETURN_REG SC_R0 #elif defined(RISCV64) +# define SC_TP SC_FIELD(sc_regs.tp) # define SC_A0 SC_FIELD(sc_regs.a0) # define SC_A1 SC_FIELD(sc_regs.a1) # define SC_A2 SC_FIELD(sc_regs.a2) diff --git a/core/unix/signal.c b/core/unix/signal.c index 01e91acee36..90767800081 100644 --- a/core/unix/signal.c +++ b/core/unix/signal.c @@ -3051,20 +3051,36 @@ mcontext_to_ucontext(kernel_ucontext_t *uc, priv_mcontext_t *mc) mcontext_to_sigcontext(&sc_full, mc, DR_MC_ALL); } -#ifdef AARCHXX +#if defined(AARCHXX) || defined(RISCV64) static void set_sigcxt_stolen_reg(sigcontext_t *sc, reg_t val) { - *(&sc->SC_R0 + (dr_reg_stolen - DR_REG_R0)) = val; + *(&sc->IF_AARCHXX_ELSE(SC_R0, SC_A0) + + (dr_reg_stolen - IF_AARCHXX_ELSE(DR_REG_R0, DR_REG_A0))) = val; } static reg_t get_sigcxt_stolen_reg(sigcontext_t *sc) { - return *(&sc->SC_R0 + (dr_reg_stolen - DR_REG_R0)); + return *(&sc->IF_AARCHXX_ELSE(SC_R0, SC_A0) + + (dr_reg_stolen - IF_AARCHXX_ELSE(DR_REG_R0, DR_REG_A0))); +} + +# ifdef RISCV64 +static void +set_sigcxt_tp_reg(sigcontext_t *sc, reg_t val) +{ + sc->SC_TP = val; } 
-# ifndef AARCH64 +static reg_t +get_sigcxt_tp_reg(sigcontext_t *sc) +{ + return sc->SC_TP; +} +# endif + +# ifdef ARM static dr_isa_mode_t get_pc_mode_from_cpsr(sigcontext_t *sc) { @@ -3251,7 +3267,11 @@ thread_set_self_context(void *cxt) #elif defined(ARM) asm("ldr " ASM_XSP ", %0" : : "m"(xsp_for_sigreturn)); asm("b dynamorio_sigreturn"); -#endif /* X86/AARCH64/ARM */ +#elif defined(RISCV64) + ASSERT_NOT_TESTED(); + asm("addi " ASM_XSP ", %0, 0" : : "r"(xsp_for_sigreturn)); + asm("j dynamorio_sigreturn"); +#endif /* X86/AARCH64/ARM/RISCV64 */ ASSERT_NOT_REACHED(); } @@ -3370,7 +3390,6 @@ sig_has_restorer(thread_sig_info_t *info, int sig) # elif defined(RISCV64) static const byte SIGRET_NONRT[8] = { 0 }; /* unused */ static const byte SIGRET_RT[8] = { 0 }; /* unused */ - ; # endif byte buf[MAX(sizeof(SIGRET_NONRT), sizeof(SIGRET_RT))] = { 0 }; if (d_r_safe_read(info->sighand->action[sig]->restorer, sizeof(buf), buf) && @@ -4048,7 +4067,7 @@ transfer_from_sig_handler_to_fcache_return(dcontext_t *dcontext, kernel_ucontext * still go to the private fcache_return for simplicity. */ sc->SC_XIP = (ptr_uint_t)fcache_return_routine(dcontext); -#if defined(AARCHXX) +#if defined(AARCHXX) || defined(RISCV64) /* We do not have to set dr_reg_stolen in dcontext's mcontext here * because dcontext's mcontext is stale and we used the mcontext * created from recreate_app_state_internal with the original sigcontext. @@ -4062,12 +4081,14 @@ transfer_from_sig_handler_to_fcache_return(dcontext_t *dcontext, kernel_ucontext dcontext->local_state->spill_space.reg_stolen = get_sigcxt_stolen_reg(sc); /* Now put DR's base in the sigcontext.
*/ set_sigcxt_stolen_reg(sc, (reg_t)*get_dr_tls_base_addr()); -# ifndef AARCH64 +# ifdef RISCV64 + set_sigcxt_tp_reg(sc, (reg_t)read_thread_register(TLS_REG_LIB)); +# endif + +# ifdef ARM /* We're going to our fcache_return gencode which uses DEFAULT_ISA_MODE */ set_pc_mode_in_cpsr(sc, DEFAULT_ISA_MODE); # endif -#elif defined(RISCV64) - ASSERT_NOT_IMPLEMENTED(false); #endif #if defined(X64) || defined(ARM) @@ -4626,7 +4647,7 @@ adjust_syscall_for_restart(dcontext_t *dcontext, thread_sig_info_t *info, int si } else { ASSERT_NOT_REACHED(); /* Inlined syscalls no longer come here. */ } -#ifdef AARCHXX +#if defined(AARCHXX) || defined(RISCV64) /* dr_reg_stolen is holding DR's TLS on receiving a signal, * so we need to put the app's reg value into the ucontext instead. * The translation process normally does this for us, but here we're doing @@ -4734,7 +4755,8 @@ find_next_fragment_from_gencode(dcontext_t *dcontext, sigcontext_t *sc) if (f == NULL && sc->SC_XCX != 0) f = fragment_lookup(dcontext, (app_pc)sc->SC_XCX); #elif defined(RISCV64) -/* FIXME i#3544: Not implemented */ + /* FIXME i#3544: Not implemented */ + ASSERT_NOT_IMPLEMENTED(false); #else # error Unsupported arch. #endif @@ -6292,7 +6314,9 @@ execute_handler_from_dispatch(dcontext_t *dcontext, int sig) dump_sigcontext(dcontext, sc); LOG(THREAD, LOG_ASYNCH, 3, "\n"); } - IF_AARCHXX(ASSERT(get_sigcxt_stolen_reg(sc) != (reg_t)*get_dr_tls_base_addr())); +# if defined(AARCHXX) || defined(RISCV64) + ASSERT(get_sigcxt_stolen_reg(sc) != (reg_t)*get_dr_tls_base_addr()); +# endif #endif /* FIXME: other state? debug regs? * if no syscall allowed between main_ (when frame created) and @@ -7370,7 +7394,7 @@ handle_sigreturn(dcontext_t *dcontext, void *ucxt_param, int style) * look like whatever would happen to the app... 
*/ ASSERT((app_pc)sc->SC_XIP != next_pc); -# if defined(AARCHXX) +# if defined(AARCHXX) || defined(RISCV64) ASSERT(get_sigcxt_stolen_reg(sc) != (reg_t)*get_dr_tls_base_addr()); /* We're called from DR and are not yet in the cache, so we want to set the * mcontext slot, not the TLS slot, to set the stolen reg value. @@ -7378,18 +7402,24 @@ handle_sigreturn(dcontext_t *dcontext, void *ucxt_param, int style) set_stolen_reg_val(get_mcontext(dcontext), get_sigcxt_stolen_reg(sc)); /* The linkstub expects DR's TLS to be in the actual register. */ set_sigcxt_stolen_reg(sc, (reg_t)*get_dr_tls_base_addr()); -# ifdef AARCH64 + +# ifdef RISCV64 + set_tp_reg_val(get_mcontext(dcontext), get_sigcxt_tp_reg(sc)); + set_sigcxt_tp_reg(sc, (reg_t)read_thread_register(TLS_REG_LIB)); +# endif + +# if defined(AARCH64) /* On entry to the do_syscall gencode, we save X1 into TLS_REG1_SLOT. * Then the sigreturn would redirect the flow to the fcache_return gencode. * In fcache_return it recovers the values of x0 and x1 from TLS_SLOT 0 and 1. 
*/ get_mcontext(dcontext)->r1 = sc->SC_FIELD_AARCH64(1); +# elif defined(RISCV64) + get_mcontext(dcontext)->a1 = sc->SC_FIELD(sc_regs.a1); # else /* We're going to our fcache_return gencode which uses DEFAULT_ISA_MODE */ set_pc_mode_in_cpsr(sc, DEFAULT_ISA_MODE); # endif -# elif defined(RISCV64) - ASSERT_NOT_IMPLEMENTED(false); # endif #endif diff --git a/core/unix/signal_linux_riscv64.c b/core/unix/signal_linux_riscv64.c index 2fc3b406a37..da86d66ca08 100644 --- a/core/unix/signal_linux_riscv64.c +++ b/core/unix/signal_linux_riscv64.c @@ -57,7 +57,39 @@ save_fpstate(dcontext_t *dcontext, sigframe_rt_t *frame) void dump_sigcontext(dcontext_t *dcontext, sigcontext_t *sc) { - LOG(THREAD, LOG_ASYNCH, 1, "FIXME i#3544: NYI on RISCV64"); + + LOG(THREAD, LOG_ASYNCH, 1, "\tpc = " PFX "\n", sc->sc_regs.pc); + LOG(THREAD, LOG_ASYNCH, 1, "\tra = " PFX "\n", sc->sc_regs.ra); + LOG(THREAD, LOG_ASYNCH, 1, "\tsp = " PFX "\n", sc->sc_regs.sp); + LOG(THREAD, LOG_ASYNCH, 1, "\tgp = " PFX "\n", sc->sc_regs.gp); + LOG(THREAD, LOG_ASYNCH, 1, "\ttp = " PFX "\n", sc->sc_regs.tp); + LOG(THREAD, LOG_ASYNCH, 1, "\tt0 = " PFX "\n", sc->sc_regs.t0); + LOG(THREAD, LOG_ASYNCH, 1, "\tt1 = " PFX "\n", sc->sc_regs.t1); + LOG(THREAD, LOG_ASYNCH, 1, "\tt2 = " PFX "\n", sc->sc_regs.t2); + LOG(THREAD, LOG_ASYNCH, 1, "\ts0 = " PFX "\n", sc->sc_regs.s0); + LOG(THREAD, LOG_ASYNCH, 1, "\ts1 = " PFX "\n", sc->sc_regs.s1); + LOG(THREAD, LOG_ASYNCH, 1, "\ta0 = " PFX "\n", sc->sc_regs.a0); + LOG(THREAD, LOG_ASYNCH, 1, "\ta1 = " PFX "\n", sc->sc_regs.a1); + LOG(THREAD, LOG_ASYNCH, 1, "\ta2 = " PFX "\n", sc->sc_regs.a2); + LOG(THREAD, LOG_ASYNCH, 1, "\ta3 = " PFX "\n", sc->sc_regs.a3); + LOG(THREAD, LOG_ASYNCH, 1, "\ta4 = " PFX "\n", sc->sc_regs.a4); + LOG(THREAD, LOG_ASYNCH, 1, "\ta5 = " PFX "\n", sc->sc_regs.a5); + LOG(THREAD, LOG_ASYNCH, 1, "\ta6 = " PFX "\n", sc->sc_regs.a6); + LOG(THREAD, LOG_ASYNCH, 1, "\ta7 = " PFX "\n", sc->sc_regs.a7); + LOG(THREAD, LOG_ASYNCH, 1, "\ts2 = " PFX "\n", sc->sc_regs.s2); 
+ LOG(THREAD, LOG_ASYNCH, 1, "\ts3 = " PFX "\n", sc->sc_regs.s3); + LOG(THREAD, LOG_ASYNCH, 1, "\ts4 = " PFX "\n", sc->sc_regs.s4); + LOG(THREAD, LOG_ASYNCH, 1, "\ts5 = " PFX "\n", sc->sc_regs.s5); + LOG(THREAD, LOG_ASYNCH, 1, "\ts6 = " PFX "\n", sc->sc_regs.s6); + LOG(THREAD, LOG_ASYNCH, 1, "\ts7 = " PFX "\n", sc->sc_regs.s7); + LOG(THREAD, LOG_ASYNCH, 1, "\ts8 = " PFX "\n", sc->sc_regs.s8); + LOG(THREAD, LOG_ASYNCH, 1, "\ts9 = " PFX "\n", sc->sc_regs.s9); + LOG(THREAD, LOG_ASYNCH, 1, "\ts10 = " PFX "\n", sc->sc_regs.s10); + LOG(THREAD, LOG_ASYNCH, 1, "\ts11 = " PFX "\n", sc->sc_regs.s11); + LOG(THREAD, LOG_ASYNCH, 1, "\tt3 = " PFX "\n", sc->sc_regs.t3); + LOG(THREAD, LOG_ASYNCH, 1, "\tt4 = " PFX "\n", sc->sc_regs.t4); + LOG(THREAD, LOG_ASYNCH, 1, "\tt5 = " PFX "\n", sc->sc_regs.t5); + LOG(THREAD, LOG_ASYNCH, 1, "\tt6 = " PFX "\n", sc->sc_regs.t6); } #endif /* DEBUG */ diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt index f19742abd65..9d63cb47626 100644 --- a/suite/tests/CMakeLists.txt +++ b/suite/tests/CMakeLists.txt @@ -4996,14 +4996,12 @@ if (UNIX) target_link_libraries(linux.signal_pre_syscall rt) endif () - if (NOT RISCV64) # TODO i#3544: Port tests to RISC-V 64 - tobuild(linux.bad-signal-stack linux/bad-signal-stack.c) + tobuild(linux.bad-signal-stack linux/bad-signal-stack.c) - # i#1145: test re-starting syscalls, both inlined and from dispatch - tobuild(linux.eintr linux/eintr.c) - link_with_pthread(linux.eintr) - torunonly(linux.eintr-noinline linux.eintr linux/eintr.c "-no_ignore_syscalls" "") - endif (NOT RISCV64) + # i#1145: test re-starting syscalls, both inlined and from dispatch + tobuild(linux.eintr linux/eintr.c) + link_with_pthread(linux.eintr) + torunonly(linux.eintr-noinline linux.eintr linux/eintr.c "-no_ignore_syscalls" "") if (NOT ANDROID AND NOT RISCV64) # XXX i#1874: get working on Android # TODO i#3544: Port tests to RISC-V 64 diff --git a/suite/tests/api/dis-a64-sve2.txt b/suite/tests/api/dis-a64-sve2.txt index 
66e0a7d1d65..00a8e7375b5 100644 --- a/suite/tests/api/dis-a64-sve2.txt +++ b/suite/tests/api/dis-a64-sve2.txt @@ -1600,6 +1600,96 @@ 453da39b : histseg z27.b, z28.b, z29.b : histseg %z28.b %z29.b -> %z27.b 453fa3ff : histseg z31.b, z31.b, z31.b : histseg %z31.b %z31.b -> %z31.b +# LDNT1B { .S }, /Z, [.S{, }] (LDNT1B-Z.P.AR-S.x32.unscaled) +8400a000 : ldnt1b z0.s, p0/Z, [z0.s, x0] : ldnt1b (%z0.s,%x0)[8byte] %p0/z -> %z0.s +8405a482 : ldnt1b z2.s, p1/Z, [z4.s, x5] : ldnt1b (%z4.s,%x5)[8byte] %p1/z -> %z2.s +8407a8c4 : ldnt1b z4.s, p2/Z, [z6.s, x7] : ldnt1b (%z6.s,%x7)[8byte] %p2/z -> %z4.s +8409a906 : ldnt1b z6.s, p2/Z, [z8.s, x9] : ldnt1b (%z8.s,%x9)[8byte] %p2/z -> %z6.s +840bad48 : ldnt1b z8.s, p3/Z, [z10.s, x11] : ldnt1b (%z10.s,%x11)[8byte] %p3/z -> %z8.s +840cad8a : ldnt1b z10.s, p3/Z, [z12.s, x12] : ldnt1b (%z12.s,%x12)[8byte] %p3/z -> %z10.s +840eb1cc : ldnt1b z12.s, p4/Z, [z14.s, x14] : ldnt1b (%z14.s,%x14)[8byte] %p4/z -> %z12.s +8410b20e : ldnt1b z14.s, p4/Z, [z16.s, x16] : ldnt1b (%z16.s,%x16)[8byte] %p4/z -> %z14.s +8412b650 : ldnt1b z16.s, p5/Z, [z18.s, x18] : ldnt1b (%z18.s,%x18)[8byte] %p5/z -> %z16.s +8414b671 : ldnt1b z17.s, p5/Z, [z19.s, x20] : ldnt1b (%z19.s,%x20)[8byte] %p5/z -> %z17.s +8416b6b3 : ldnt1b z19.s, p5/Z, [z21.s, x22] : ldnt1b (%z21.s,%x22)[8byte] %p5/z -> %z19.s +8418baf5 : ldnt1b z21.s, p6/Z, [z23.s, x24] : ldnt1b (%z23.s,%x24)[8byte] %p6/z -> %z21.s +8419bb37 : ldnt1b z23.s, p6/Z, [z25.s, x25] : ldnt1b (%z25.s,%x25)[8byte] %p6/z -> %z23.s +841bbf79 : ldnt1b z25.s, p7/Z, [z27.s, x27] : ldnt1b (%z27.s,%x27)[8byte] %p7/z -> %z25.s +841dbfbb : ldnt1b z27.s, p7/Z, [z29.s, x29] : ldnt1b (%z29.s,%x29)[8byte] %p7/z -> %z27.s +841ebfff : ldnt1b z31.s, p7/Z, [z31.s, x30] : ldnt1b (%z31.s,%x30)[8byte] %p7/z -> %z31.s + +# LDNT1B { .D }, /Z, [.D{, }] (LDNT1B-Z.P.AR-D.64.unscaled) +c400c000 : ldnt1b z0.d, p0/Z, [z0.d, x0] : ldnt1b (%z0.d,%x0)[4byte] %p0/z -> %z0.d +c405c482 : ldnt1b z2.d, p1/Z, [z4.d, x5] : ldnt1b (%z4.d,%x5)[4byte] %p1/z -> 
%z2.d +c407c8c4 : ldnt1b z4.d, p2/Z, [z6.d, x7] : ldnt1b (%z6.d,%x7)[4byte] %p2/z -> %z4.d +c409c906 : ldnt1b z6.d, p2/Z, [z8.d, x9] : ldnt1b (%z8.d,%x9)[4byte] %p2/z -> %z6.d +c40bcd48 : ldnt1b z8.d, p3/Z, [z10.d, x11] : ldnt1b (%z10.d,%x11)[4byte] %p3/z -> %z8.d +c40ccd8a : ldnt1b z10.d, p3/Z, [z12.d, x12] : ldnt1b (%z12.d,%x12)[4byte] %p3/z -> %z10.d +c40ed1cc : ldnt1b z12.d, p4/Z, [z14.d, x14] : ldnt1b (%z14.d,%x14)[4byte] %p4/z -> %z12.d +c410d20e : ldnt1b z14.d, p4/Z, [z16.d, x16] : ldnt1b (%z16.d,%x16)[4byte] %p4/z -> %z14.d +c412d650 : ldnt1b z16.d, p5/Z, [z18.d, x18] : ldnt1b (%z18.d,%x18)[4byte] %p5/z -> %z16.d +c414d671 : ldnt1b z17.d, p5/Z, [z19.d, x20] : ldnt1b (%z19.d,%x20)[4byte] %p5/z -> %z17.d +c416d6b3 : ldnt1b z19.d, p5/Z, [z21.d, x22] : ldnt1b (%z21.d,%x22)[4byte] %p5/z -> %z19.d +c418daf5 : ldnt1b z21.d, p6/Z, [z23.d, x24] : ldnt1b (%z23.d,%x24)[4byte] %p6/z -> %z21.d +c419db37 : ldnt1b z23.d, p6/Z, [z25.d, x25] : ldnt1b (%z25.d,%x25)[4byte] %p6/z -> %z23.d +c41bdf79 : ldnt1b z25.d, p7/Z, [z27.d, x27] : ldnt1b (%z27.d,%x27)[4byte] %p7/z -> %z25.d +c41ddfbb : ldnt1b z27.d, p7/Z, [z29.d, x29] : ldnt1b (%z29.d,%x29)[4byte] %p7/z -> %z27.d +c41edfff : ldnt1b z31.d, p7/Z, [z31.d, x30] : ldnt1b (%z31.d,%x30)[4byte] %p7/z -> %z31.d + +# LDNT1D { .D }, /Z, [.D{, }] (LDNT1D-Z.P.AR-D.64.unscaled) +c580c000 : ldnt1d z0.d, p0/Z, [z0.d, x0] : ldnt1d (%z0.d,%x0)[32byte] %p0/z -> %z0.d +c585c482 : ldnt1d z2.d, p1/Z, [z4.d, x5] : ldnt1d (%z4.d,%x5)[32byte] %p1/z -> %z2.d +c587c8c4 : ldnt1d z4.d, p2/Z, [z6.d, x7] : ldnt1d (%z6.d,%x7)[32byte] %p2/z -> %z4.d +c589c906 : ldnt1d z6.d, p2/Z, [z8.d, x9] : ldnt1d (%z8.d,%x9)[32byte] %p2/z -> %z6.d +c58bcd48 : ldnt1d z8.d, p3/Z, [z10.d, x11] : ldnt1d (%z10.d,%x11)[32byte] %p3/z -> %z8.d +c58ccd8a : ldnt1d z10.d, p3/Z, [z12.d, x12] : ldnt1d (%z12.d,%x12)[32byte] %p3/z -> %z10.d +c58ed1cc : ldnt1d z12.d, p4/Z, [z14.d, x14] : ldnt1d (%z14.d,%x14)[32byte] %p4/z -> %z12.d +c590d20e : ldnt1d z14.d, p4/Z, [z16.d, x16] : 
ldnt1d (%z16.d,%x16)[32byte] %p4/z -> %z14.d +c592d650 : ldnt1d z16.d, p5/Z, [z18.d, x18] : ldnt1d (%z18.d,%x18)[32byte] %p5/z -> %z16.d +c594d671 : ldnt1d z17.d, p5/Z, [z19.d, x20] : ldnt1d (%z19.d,%x20)[32byte] %p5/z -> %z17.d +c596d6b3 : ldnt1d z19.d, p5/Z, [z21.d, x22] : ldnt1d (%z21.d,%x22)[32byte] %p5/z -> %z19.d +c598daf5 : ldnt1d z21.d, p6/Z, [z23.d, x24] : ldnt1d (%z23.d,%x24)[32byte] %p6/z -> %z21.d +c599db37 : ldnt1d z23.d, p6/Z, [z25.d, x25] : ldnt1d (%z25.d,%x25)[32byte] %p6/z -> %z23.d +c59bdf79 : ldnt1d z25.d, p7/Z, [z27.d, x27] : ldnt1d (%z27.d,%x27)[32byte] %p7/z -> %z25.d +c59ddfbb : ldnt1d z27.d, p7/Z, [z29.d, x29] : ldnt1d (%z29.d,%x29)[32byte] %p7/z -> %z27.d +c59edfff : ldnt1d z31.d, p7/Z, [z31.d, x30] : ldnt1d (%z31.d,%x30)[32byte] %p7/z -> %z31.d + +# LDNT1H { .S }, /Z, [.S{, }] (LDNT1H-Z.P.AR-S.x32.unscaled) +8480a000 : ldnt1h z0.s, p0/Z, [z0.s, x0] : ldnt1h (%z0.s,%x0)[16byte] %p0/z -> %z0.s +8485a482 : ldnt1h z2.s, p1/Z, [z4.s, x5] : ldnt1h (%z4.s,%x5)[16byte] %p1/z -> %z2.s +8487a8c4 : ldnt1h z4.s, p2/Z, [z6.s, x7] : ldnt1h (%z6.s,%x7)[16byte] %p2/z -> %z4.s +8489a906 : ldnt1h z6.s, p2/Z, [z8.s, x9] : ldnt1h (%z8.s,%x9)[16byte] %p2/z -> %z6.s +848bad48 : ldnt1h z8.s, p3/Z, [z10.s, x11] : ldnt1h (%z10.s,%x11)[16byte] %p3/z -> %z8.s +848cad8a : ldnt1h z10.s, p3/Z, [z12.s, x12] : ldnt1h (%z12.s,%x12)[16byte] %p3/z -> %z10.s +848eb1cc : ldnt1h z12.s, p4/Z, [z14.s, x14] : ldnt1h (%z14.s,%x14)[16byte] %p4/z -> %z12.s +8490b20e : ldnt1h z14.s, p4/Z, [z16.s, x16] : ldnt1h (%z16.s,%x16)[16byte] %p4/z -> %z14.s +8492b650 : ldnt1h z16.s, p5/Z, [z18.s, x18] : ldnt1h (%z18.s,%x18)[16byte] %p5/z -> %z16.s +8494b671 : ldnt1h z17.s, p5/Z, [z19.s, x20] : ldnt1h (%z19.s,%x20)[16byte] %p5/z -> %z17.s +8496b6b3 : ldnt1h z19.s, p5/Z, [z21.s, x22] : ldnt1h (%z21.s,%x22)[16byte] %p5/z -> %z19.s +8498baf5 : ldnt1h z21.s, p6/Z, [z23.s, x24] : ldnt1h (%z23.s,%x24)[16byte] %p6/z -> %z21.s +8499bb37 : ldnt1h z23.s, p6/Z, [z25.s, x25] : ldnt1h (%z25.s,%x25)[16byte] 
%p6/z -> %z23.s +849bbf79 : ldnt1h z25.s, p7/Z, [z27.s, x27] : ldnt1h (%z27.s,%x27)[16byte] %p7/z -> %z25.s +849dbfbb : ldnt1h z27.s, p7/Z, [z29.s, x29] : ldnt1h (%z29.s,%x29)[16byte] %p7/z -> %z27.s +849ebfff : ldnt1h z31.s, p7/Z, [z31.s, x30] : ldnt1h (%z31.s,%x30)[16byte] %p7/z -> %z31.s + +# LDNT1H { .D }, /Z, [.D{, }] (LDNT1H-Z.P.AR-D.64.unscaled) +c480c000 : ldnt1h z0.d, p0/Z, [z0.d, x0] : ldnt1h (%z0.d,%x0)[8byte] %p0/z -> %z0.d +c485c482 : ldnt1h z2.d, p1/Z, [z4.d, x5] : ldnt1h (%z4.d,%x5)[8byte] %p1/z -> %z2.d +c487c8c4 : ldnt1h z4.d, p2/Z, [z6.d, x7] : ldnt1h (%z6.d,%x7)[8byte] %p2/z -> %z4.d +c489c906 : ldnt1h z6.d, p2/Z, [z8.d, x9] : ldnt1h (%z8.d,%x9)[8byte] %p2/z -> %z6.d +c48bcd48 : ldnt1h z8.d, p3/Z, [z10.d, x11] : ldnt1h (%z10.d,%x11)[8byte] %p3/z -> %z8.d +c48ccd8a : ldnt1h z10.d, p3/Z, [z12.d, x12] : ldnt1h (%z12.d,%x12)[8byte] %p3/z -> %z10.d +c48ed1cc : ldnt1h z12.d, p4/Z, [z14.d, x14] : ldnt1h (%z14.d,%x14)[8byte] %p4/z -> %z12.d +c490d20e : ldnt1h z14.d, p4/Z, [z16.d, x16] : ldnt1h (%z16.d,%x16)[8byte] %p4/z -> %z14.d +c492d650 : ldnt1h z16.d, p5/Z, [z18.d, x18] : ldnt1h (%z18.d,%x18)[8byte] %p5/z -> %z16.d +c494d671 : ldnt1h z17.d, p5/Z, [z19.d, x20] : ldnt1h (%z19.d,%x20)[8byte] %p5/z -> %z17.d +c496d6b3 : ldnt1h z19.d, p5/Z, [z21.d, x22] : ldnt1h (%z21.d,%x22)[8byte] %p5/z -> %z19.d +c498daf5 : ldnt1h z21.d, p6/Z, [z23.d, x24] : ldnt1h (%z23.d,%x24)[8byte] %p6/z -> %z21.d +c499db37 : ldnt1h z23.d, p6/Z, [z25.d, x25] : ldnt1h (%z25.d,%x25)[8byte] %p6/z -> %z23.d +c49bdf79 : ldnt1h z25.d, p7/Z, [z27.d, x27] : ldnt1h (%z27.d,%x27)[8byte] %p7/z -> %z25.d +c49ddfbb : ldnt1h z27.d, p7/Z, [z29.d, x29] : ldnt1h (%z29.d,%x29)[8byte] %p7/z -> %z27.d +c49edfff : ldnt1h z31.d, p7/Z, [z31.d, x30] : ldnt1h (%z31.d,%x30)[8byte] %p7/z -> %z31.d + # LDNT1SB { .S }, /Z, [.S{, }] (LDNT1SB-Z.P.AR-S.x32.unscaled) 84008000 : ldnt1sb z0.s, p0/Z, [z0.s, x0] : ldnt1sb (%z0.s,%x0)[8byte] %p0/z -> %z0.s 84058482 : ldnt1sb z2.s, p1/Z, [z4.s, x5] : ldnt1sb 
(%z4.s,%x5)[8byte] %p1/z -> %z2.s @@ -1690,6 +1780,42 @@ c51b9f79 : ldnt1sw z25.d, p7/Z, [z27.d, x27] : ldnt1sw (%z27.d,%x27)[16b c51d9fbb : ldnt1sw z27.d, p7/Z, [z29.d, x29] : ldnt1sw (%z29.d,%x29)[16byte] %p7/z -> %z27.d c51e9fff : ldnt1sw z31.d, p7/Z, [z31.d, x30] : ldnt1sw (%z31.d,%x30)[16byte] %p7/z -> %z31.d +# LDNT1W { .S }, /Z, [.S{, }] (LDNT1W-Z.P.AR-S.x32.unscaled) +8500a000 : ldnt1w z0.s, p0/Z, [z0.s, x0] : ldnt1w (%z0.s,%x0)[32byte] %p0/z -> %z0.s +8505a482 : ldnt1w z2.s, p1/Z, [z4.s, x5] : ldnt1w (%z4.s,%x5)[32byte] %p1/z -> %z2.s +8507a8c4 : ldnt1w z4.s, p2/Z, [z6.s, x7] : ldnt1w (%z6.s,%x7)[32byte] %p2/z -> %z4.s +8509a906 : ldnt1w z6.s, p2/Z, [z8.s, x9] : ldnt1w (%z8.s,%x9)[32byte] %p2/z -> %z6.s +850bad48 : ldnt1w z8.s, p3/Z, [z10.s, x11] : ldnt1w (%z10.s,%x11)[32byte] %p3/z -> %z8.s +850cad8a : ldnt1w z10.s, p3/Z, [z12.s, x12] : ldnt1w (%z12.s,%x12)[32byte] %p3/z -> %z10.s +850eb1cc : ldnt1w z12.s, p4/Z, [z14.s, x14] : ldnt1w (%z14.s,%x14)[32byte] %p4/z -> %z12.s +8510b20e : ldnt1w z14.s, p4/Z, [z16.s, x16] : ldnt1w (%z16.s,%x16)[32byte] %p4/z -> %z14.s +8512b650 : ldnt1w z16.s, p5/Z, [z18.s, x18] : ldnt1w (%z18.s,%x18)[32byte] %p5/z -> %z16.s +8514b671 : ldnt1w z17.s, p5/Z, [z19.s, x20] : ldnt1w (%z19.s,%x20)[32byte] %p5/z -> %z17.s +8516b6b3 : ldnt1w z19.s, p5/Z, [z21.s, x22] : ldnt1w (%z21.s,%x22)[32byte] %p5/z -> %z19.s +8518baf5 : ldnt1w z21.s, p6/Z, [z23.s, x24] : ldnt1w (%z23.s,%x24)[32byte] %p6/z -> %z21.s +8519bb37 : ldnt1w z23.s, p6/Z, [z25.s, x25] : ldnt1w (%z25.s,%x25)[32byte] %p6/z -> %z23.s +851bbf79 : ldnt1w z25.s, p7/Z, [z27.s, x27] : ldnt1w (%z27.s,%x27)[32byte] %p7/z -> %z25.s +851dbfbb : ldnt1w z27.s, p7/Z, [z29.s, x29] : ldnt1w (%z29.s,%x29)[32byte] %p7/z -> %z27.s +851ebfff : ldnt1w z31.s, p7/Z, [z31.s, x30] : ldnt1w (%z31.s,%x30)[32byte] %p7/z -> %z31.s + +# LDNT1W { .D }, /Z, [.D{, }] (LDNT1W-Z.P.AR-D.64.unscaled) +c500c000 : ldnt1w z0.d, p0/Z, [z0.d, x0] : ldnt1w (%z0.d,%x0)[16byte] %p0/z -> %z0.d +c505c482 : ldnt1w z2.d, 
p1/Z, [z4.d, x5] : ldnt1w (%z4.d,%x5)[16byte] %p1/z -> %z2.d +c507c8c4 : ldnt1w z4.d, p2/Z, [z6.d, x7] : ldnt1w (%z6.d,%x7)[16byte] %p2/z -> %z4.d +c509c906 : ldnt1w z6.d, p2/Z, [z8.d, x9] : ldnt1w (%z8.d,%x9)[16byte] %p2/z -> %z6.d +c50bcd48 : ldnt1w z8.d, p3/Z, [z10.d, x11] : ldnt1w (%z10.d,%x11)[16byte] %p3/z -> %z8.d +c50ccd8a : ldnt1w z10.d, p3/Z, [z12.d, x12] : ldnt1w (%z12.d,%x12)[16byte] %p3/z -> %z10.d +c50ed1cc : ldnt1w z12.d, p4/Z, [z14.d, x14] : ldnt1w (%z14.d,%x14)[16byte] %p4/z -> %z12.d +c510d20e : ldnt1w z14.d, p4/Z, [z16.d, x16] : ldnt1w (%z16.d,%x16)[16byte] %p4/z -> %z14.d +c512d650 : ldnt1w z16.d, p5/Z, [z18.d, x18] : ldnt1w (%z18.d,%x18)[16byte] %p5/z -> %z16.d +c514d671 : ldnt1w z17.d, p5/Z, [z19.d, x20] : ldnt1w (%z19.d,%x20)[16byte] %p5/z -> %z17.d +c516d6b3 : ldnt1w z19.d, p5/Z, [z21.d, x22] : ldnt1w (%z21.d,%x22)[16byte] %p5/z -> %z19.d +c518daf5 : ldnt1w z21.d, p6/Z, [z23.d, x24] : ldnt1w (%z23.d,%x24)[16byte] %p6/z -> %z21.d +c519db37 : ldnt1w z23.d, p6/Z, [z25.d, x25] : ldnt1w (%z25.d,%x25)[16byte] %p6/z -> %z23.d +c51bdf79 : ldnt1w z25.d, p7/Z, [z27.d, x27] : ldnt1w (%z27.d,%x27)[16byte] %p7/z -> %z25.d +c51ddfbb : ldnt1w z27.d, p7/Z, [z29.d, x29] : ldnt1w (%z29.d,%x29)[16byte] %p7/z -> %z27.d +c51edfff : ldnt1w z31.d, p7/Z, [z31.d, x30] : ldnt1w (%z31.d,%x30)[16byte] %p7/z -> %z31.d + # MATCH ., /Z, ., . 
(MATCH-P.P.ZZ-_) 45208000 : match p0.b, p0/Z, z0.b, z0.b : match %p0/z %z0.b %z0.b -> %p0.b 45258481 : match p1.b, p1/Z, z4.b, z5.b : match %p1/z %z4.b %z5.b -> %p1.b @@ -7182,6 +7308,132 @@ c51e9fff : ldnt1sw z31.d, p7/Z, [z31.d, x30] : ldnt1sw (%z31.d,%x30)[16b 45dd579b : ssubwt z27.d, z28.d, z29.s : ssubwt %z28.d %z29.s -> %z27.d 45df57ff : ssubwt z31.d, z31.d, z31.s : ssubwt %z31.d %z31.s -> %z31.d +# STNT1B { .D }, , [.D{, }] (STNT1B-Z.P.AR-D.64.unscaled) +e4002000 : stnt1b z0.d, p0, [z0.d, x0] : stnt1b %z0.d %p0 -> (%z0.d,%x0)[4byte] +e4052482 : stnt1b z2.d, p1, [z4.d, x5] : stnt1b %z2.d %p1 -> (%z4.d,%x5)[4byte] +e40728c4 : stnt1b z4.d, p2, [z6.d, x7] : stnt1b %z4.d %p2 -> (%z6.d,%x7)[4byte] +e4092906 : stnt1b z6.d, p2, [z8.d, x9] : stnt1b %z6.d %p2 -> (%z8.d,%x9)[4byte] +e40b2d48 : stnt1b z8.d, p3, [z10.d, x11] : stnt1b %z8.d %p3 -> (%z10.d,%x11)[4byte] +e40c2d8a : stnt1b z10.d, p3, [z12.d, x12] : stnt1b %z10.d %p3 -> (%z12.d,%x12)[4byte] +e40e31cc : stnt1b z12.d, p4, [z14.d, x14] : stnt1b %z12.d %p4 -> (%z14.d,%x14)[4byte] +e410320e : stnt1b z14.d, p4, [z16.d, x16] : stnt1b %z14.d %p4 -> (%z16.d,%x16)[4byte] +e4123650 : stnt1b z16.d, p5, [z18.d, x18] : stnt1b %z16.d %p5 -> (%z18.d,%x18)[4byte] +e4143671 : stnt1b z17.d, p5, [z19.d, x20] : stnt1b %z17.d %p5 -> (%z19.d,%x20)[4byte] +e41636b3 : stnt1b z19.d, p5, [z21.d, x22] : stnt1b %z19.d %p5 -> (%z21.d,%x22)[4byte] +e4183af5 : stnt1b z21.d, p6, [z23.d, x24] : stnt1b %z21.d %p6 -> (%z23.d,%x24)[4byte] +e4193b37 : stnt1b z23.d, p6, [z25.d, x25] : stnt1b %z23.d %p6 -> (%z25.d,%x25)[4byte] +e41b3f79 : stnt1b z25.d, p7, [z27.d, x27] : stnt1b %z25.d %p7 -> (%z27.d,%x27)[4byte] +e41d3fbb : stnt1b z27.d, p7, [z29.d, x29] : stnt1b %z27.d %p7 -> (%z29.d,%x29)[4byte] +e41e3fff : stnt1b z31.d, p7, [z31.d, x30] : stnt1b %z31.d %p7 -> (%z31.d,%x30)[4byte] + +# STNT1B { .S }, , [.S{, }] (STNT1B-Z.P.AR-S.x32.unscaled) +e4402000 : stnt1b z0.s, p0, [z0.s, x0] : stnt1b %z0.s %p0 -> (%z0.s,%x0)[8byte] +e4452482 : stnt1b z2.s, 
p1, [z4.s, x5] : stnt1b %z2.s %p1 -> (%z4.s,%x5)[8byte] +e44728c4 : stnt1b z4.s, p2, [z6.s, x7] : stnt1b %z4.s %p2 -> (%z6.s,%x7)[8byte] +e4492906 : stnt1b z6.s, p2, [z8.s, x9] : stnt1b %z6.s %p2 -> (%z8.s,%x9)[8byte] +e44b2d48 : stnt1b z8.s, p3, [z10.s, x11] : stnt1b %z8.s %p3 -> (%z10.s,%x11)[8byte] +e44c2d8a : stnt1b z10.s, p3, [z12.s, x12] : stnt1b %z10.s %p3 -> (%z12.s,%x12)[8byte] +e44e31cc : stnt1b z12.s, p4, [z14.s, x14] : stnt1b %z12.s %p4 -> (%z14.s,%x14)[8byte] +e450320e : stnt1b z14.s, p4, [z16.s, x16] : stnt1b %z14.s %p4 -> (%z16.s,%x16)[8byte] +e4523650 : stnt1b z16.s, p5, [z18.s, x18] : stnt1b %z16.s %p5 -> (%z18.s,%x18)[8byte] +e4543671 : stnt1b z17.s, p5, [z19.s, x20] : stnt1b %z17.s %p5 -> (%z19.s,%x20)[8byte] +e45636b3 : stnt1b z19.s, p5, [z21.s, x22] : stnt1b %z19.s %p5 -> (%z21.s,%x22)[8byte] +e4583af5 : stnt1b z21.s, p6, [z23.s, x24] : stnt1b %z21.s %p6 -> (%z23.s,%x24)[8byte] +e4593b37 : stnt1b z23.s, p6, [z25.s, x25] : stnt1b %z23.s %p6 -> (%z25.s,%x25)[8byte] +e45b3f79 : stnt1b z25.s, p7, [z27.s, x27] : stnt1b %z25.s %p7 -> (%z27.s,%x27)[8byte] +e45d3fbb : stnt1b z27.s, p7, [z29.s, x29] : stnt1b %z27.s %p7 -> (%z29.s,%x29)[8byte] +e45e3fff : stnt1b z31.s, p7, [z31.s, x30] : stnt1b %z31.s %p7 -> (%z31.s,%x30)[8byte] + +# STNT1D { .D }, , [.D{, }] (STNT1D-Z.P.AR-D.64.unscaled) +e5802000 : stnt1d z0.d, p0, [z0.d, x0] : stnt1d %z0.d %p0 -> (%z0.d,%x0)[32byte] +e5852482 : stnt1d z2.d, p1, [z4.d, x5] : stnt1d %z2.d %p1 -> (%z4.d,%x5)[32byte] +e58728c4 : stnt1d z4.d, p2, [z6.d, x7] : stnt1d %z4.d %p2 -> (%z6.d,%x7)[32byte] +e5892906 : stnt1d z6.d, p2, [z8.d, x9] : stnt1d %z6.d %p2 -> (%z8.d,%x9)[32byte] +e58b2d48 : stnt1d z8.d, p3, [z10.d, x11] : stnt1d %z8.d %p3 -> (%z10.d,%x11)[32byte] +e58c2d8a : stnt1d z10.d, p3, [z12.d, x12] : stnt1d %z10.d %p3 -> (%z12.d,%x12)[32byte] +e58e31cc : stnt1d z12.d, p4, [z14.d, x14] : stnt1d %z12.d %p4 -> (%z14.d,%x14)[32byte] +e590320e : stnt1d z14.d, p4, [z16.d, x16] : stnt1d %z14.d %p4 -> (%z16.d,%x16)[32byte] 
+e5923650 : stnt1d z16.d, p5, [z18.d, x18] : stnt1d %z16.d %p5 -> (%z18.d,%x18)[32byte] +e5943671 : stnt1d z17.d, p5, [z19.d, x20] : stnt1d %z17.d %p5 -> (%z19.d,%x20)[32byte] +e59636b3 : stnt1d z19.d, p5, [z21.d, x22] : stnt1d %z19.d %p5 -> (%z21.d,%x22)[32byte] +e5983af5 : stnt1d z21.d, p6, [z23.d, x24] : stnt1d %z21.d %p6 -> (%z23.d,%x24)[32byte] +e5993b37 : stnt1d z23.d, p6, [z25.d, x25] : stnt1d %z23.d %p6 -> (%z25.d,%x25)[32byte] +e59b3f79 : stnt1d z25.d, p7, [z27.d, x27] : stnt1d %z25.d %p7 -> (%z27.d,%x27)[32byte] +e59d3fbb : stnt1d z27.d, p7, [z29.d, x29] : stnt1d %z27.d %p7 -> (%z29.d,%x29)[32byte] +e59e3fff : stnt1d z31.d, p7, [z31.d, x30] : stnt1d %z31.d %p7 -> (%z31.d,%x30)[32byte] + +# STNT1H { <Zt>.D }, <Pg>, [<Zn>.D{, <Xm>}] (STNT1H-Z.P.AR-D.64.unscaled) +e4802000 : stnt1h z0.d, p0, [z0.d, x0] : stnt1h %z0.d %p0 -> (%z0.d,%x0)[8byte] +e4852482 : stnt1h z2.d, p1, [z4.d, x5] : stnt1h %z2.d %p1 -> (%z4.d,%x5)[8byte] +e48728c4 : stnt1h z4.d, p2, [z6.d, x7] : stnt1h %z4.d %p2 -> (%z6.d,%x7)[8byte] +e4892906 : stnt1h z6.d, p2, [z8.d, x9] : stnt1h %z6.d %p2 -> (%z8.d,%x9)[8byte] +e48b2d48 : stnt1h z8.d, p3, [z10.d, x11] : stnt1h %z8.d %p3 -> (%z10.d,%x11)[8byte] +e48c2d8a : stnt1h z10.d, p3, [z12.d, x12] : stnt1h %z10.d %p3 -> (%z12.d,%x12)[8byte] +e48e31cc : stnt1h z12.d, p4, [z14.d, x14] : stnt1h %z12.d %p4 -> (%z14.d,%x14)[8byte] +e490320e : stnt1h z14.d, p4, [z16.d, x16] : stnt1h %z14.d %p4 -> (%z16.d,%x16)[8byte] +e4923650 : stnt1h z16.d, p5, [z18.d, x18] : stnt1h %z16.d %p5 -> (%z18.d,%x18)[8byte] +e4943671 : stnt1h z17.d, p5, [z19.d, x20] : stnt1h %z17.d %p5 -> (%z19.d,%x20)[8byte] +e49636b3 : stnt1h z19.d, p5, [z21.d, x22] : stnt1h %z19.d %p5 -> (%z21.d,%x22)[8byte] +e4983af5 : stnt1h z21.d, p6, [z23.d, x24] : stnt1h %z21.d %p6 -> (%z23.d,%x24)[8byte] +e4993b37 : stnt1h z23.d, p6, [z25.d, x25] : stnt1h %z23.d %p6 -> (%z25.d,%x25)[8byte] +e49b3f79 : stnt1h z25.d, p7, [z27.d, x27] : stnt1h %z25.d %p7 -> (%z27.d,%x27)[8byte] +e49d3fbb : stnt1h z27.d, p7, [z29.d, x29]
: stnt1h %z27.d %p7 -> (%z29.d,%x29)[8byte] +e49e3fff : stnt1h z31.d, p7, [z31.d, x30] : stnt1h %z31.d %p7 -> (%z31.d,%x30)[8byte] + +# STNT1H { <Zt>.S }, <Pg>, [<Zn>.S{, <Xm>}] (STNT1H-Z.P.AR-S.x32.unscaled) +e4c02000 : stnt1h z0.s, p0, [z0.s, x0] : stnt1h %z0.s %p0 -> (%z0.s,%x0)[16byte] +e4c52482 : stnt1h z2.s, p1, [z4.s, x5] : stnt1h %z2.s %p1 -> (%z4.s,%x5)[16byte] +e4c728c4 : stnt1h z4.s, p2, [z6.s, x7] : stnt1h %z4.s %p2 -> (%z6.s,%x7)[16byte] +e4c92906 : stnt1h z6.s, p2, [z8.s, x9] : stnt1h %z6.s %p2 -> (%z8.s,%x9)[16byte] +e4cb2d48 : stnt1h z8.s, p3, [z10.s, x11] : stnt1h %z8.s %p3 -> (%z10.s,%x11)[16byte] +e4cc2d8a : stnt1h z10.s, p3, [z12.s, x12] : stnt1h %z10.s %p3 -> (%z12.s,%x12)[16byte] +e4ce31cc : stnt1h z12.s, p4, [z14.s, x14] : stnt1h %z12.s %p4 -> (%z14.s,%x14)[16byte] +e4d0320e : stnt1h z14.s, p4, [z16.s, x16] : stnt1h %z14.s %p4 -> (%z16.s,%x16)[16byte] +e4d23650 : stnt1h z16.s, p5, [z18.s, x18] : stnt1h %z16.s %p5 -> (%z18.s,%x18)[16byte] +e4d43671 : stnt1h z17.s, p5, [z19.s, x20] : stnt1h %z17.s %p5 -> (%z19.s,%x20)[16byte] +e4d636b3 : stnt1h z19.s, p5, [z21.s, x22] : stnt1h %z19.s %p5 -> (%z21.s,%x22)[16byte] +e4d83af5 : stnt1h z21.s, p6, [z23.s, x24] : stnt1h %z21.s %p6 -> (%z23.s,%x24)[16byte] +e4d93b37 : stnt1h z23.s, p6, [z25.s, x25] : stnt1h %z23.s %p6 -> (%z25.s,%x25)[16byte] +e4db3f79 : stnt1h z25.s, p7, [z27.s, x27] : stnt1h %z25.s %p7 -> (%z27.s,%x27)[16byte] +e4dd3fbb : stnt1h z27.s, p7, [z29.s, x29] : stnt1h %z27.s %p7 -> (%z29.s,%x29)[16byte] +e4de3fff : stnt1h z31.s, p7, [z31.s, x30] : stnt1h %z31.s %p7 -> (%z31.s,%x30)[16byte] + +# STNT1W { <Zt>.D }, <Pg>, [<Zn>.D{, <Xm>}] (STNT1W-Z.P.AR-D.64.unscaled) +e5002000 : stnt1w z0.d, p0, [z0.d, x0] : stnt1w %z0.d %p0 -> (%z0.d,%x0)[16byte] +e5052482 : stnt1w z2.d, p1, [z4.d, x5] : stnt1w %z2.d %p1 -> (%z4.d,%x5)[16byte] +e50728c4 : stnt1w z4.d, p2, [z6.d, x7] : stnt1w %z4.d %p2 -> (%z6.d,%x7)[16byte] +e5092906 : stnt1w z6.d, p2, [z8.d, x9] : stnt1w %z6.d %p2 -> (%z8.d,%x9)[16byte] +e50b2d48 : stnt1w z8.d, p3,
[z10.d, x11] : stnt1w %z8.d %p3 -> (%z10.d,%x11)[16byte] +e50c2d8a : stnt1w z10.d, p3, [z12.d, x12] : stnt1w %z10.d %p3 -> (%z12.d,%x12)[16byte] +e50e31cc : stnt1w z12.d, p4, [z14.d, x14] : stnt1w %z12.d %p4 -> (%z14.d,%x14)[16byte] +e510320e : stnt1w z14.d, p4, [z16.d, x16] : stnt1w %z14.d %p4 -> (%z16.d,%x16)[16byte] +e5123650 : stnt1w z16.d, p5, [z18.d, x18] : stnt1w %z16.d %p5 -> (%z18.d,%x18)[16byte] +e5143671 : stnt1w z17.d, p5, [z19.d, x20] : stnt1w %z17.d %p5 -> (%z19.d,%x20)[16byte] +e51636b3 : stnt1w z19.d, p5, [z21.d, x22] : stnt1w %z19.d %p5 -> (%z21.d,%x22)[16byte] +e5183af5 : stnt1w z21.d, p6, [z23.d, x24] : stnt1w %z21.d %p6 -> (%z23.d,%x24)[16byte] +e5193b37 : stnt1w z23.d, p6, [z25.d, x25] : stnt1w %z23.d %p6 -> (%z25.d,%x25)[16byte] +e51b3f79 : stnt1w z25.d, p7, [z27.d, x27] : stnt1w %z25.d %p7 -> (%z27.d,%x27)[16byte] +e51d3fbb : stnt1w z27.d, p7, [z29.d, x29] : stnt1w %z27.d %p7 -> (%z29.d,%x29)[16byte] +e51e3fff : stnt1w z31.d, p7, [z31.d, x30] : stnt1w %z31.d %p7 -> (%z31.d,%x30)[16byte] + +# STNT1W { <Zt>.S }, <Pg>, [<Zn>.S{, <Xm>}] (STNT1W-Z.P.AR-S.x32.unscaled) +e5402000 : stnt1w z0.s, p0, [z0.s, x0] : stnt1w %z0.s %p0 -> (%z0.s,%x0)[32byte] +e5452482 : stnt1w z2.s, p1, [z4.s, x5] : stnt1w %z2.s %p1 -> (%z4.s,%x5)[32byte] +e54728c4 : stnt1w z4.s, p2, [z6.s, x7] : stnt1w %z4.s %p2 -> (%z6.s,%x7)[32byte] +e5492906 : stnt1w z6.s, p2, [z8.s, x9] : stnt1w %z6.s %p2 -> (%z8.s,%x9)[32byte] +e54b2d48 : stnt1w z8.s, p3, [z10.s, x11] : stnt1w %z8.s %p3 -> (%z10.s,%x11)[32byte] +e54c2d8a : stnt1w z10.s, p3, [z12.s, x12] : stnt1w %z10.s %p3 -> (%z12.s,%x12)[32byte] +e54e31cc : stnt1w z12.s, p4, [z14.s, x14] : stnt1w %z12.s %p4 -> (%z14.s,%x14)[32byte] +e550320e : stnt1w z14.s, p4, [z16.s, x16] : stnt1w %z14.s %p4 -> (%z16.s,%x16)[32byte] +e5523650 : stnt1w z16.s, p5, [z18.s, x18] : stnt1w %z16.s %p5 -> (%z18.s,%x18)[32byte] +e5543671 : stnt1w z17.s, p5, [z19.s, x20] : stnt1w %z17.s %p5 -> (%z19.s,%x20)[32byte] +e55636b3 : stnt1w z19.s, p5, [z21.s, x22] : stnt1w %z19.s
%p5 -> (%z21.s,%x22)[32byte] +e5583af5 : stnt1w z21.s, p6, [z23.s, x24] : stnt1w %z21.s %p6 -> (%z23.s,%x24)[32byte] +e5593b37 : stnt1w z23.s, p6, [z25.s, x25] : stnt1w %z23.s %p6 -> (%z25.s,%x25)[32byte] +e55b3f79 : stnt1w z25.s, p7, [z27.s, x27] : stnt1w %z25.s %p7 -> (%z27.s,%x27)[32byte] +e55d3fbb : stnt1w z27.s, p7, [z29.s, x29] : stnt1w %z27.s %p7 -> (%z29.s,%x29)[32byte] +e55e3fff : stnt1w z31.s, p7, [z31.s, x30] : stnt1w %z31.s %p7 -> (%z31.s,%x30)[32byte] + # SUBHNB <Zd>.<T>, <Zn>.<Tb>, <Zm>.<Tb> (SUBHNB-Z.ZZ-_) 45607000 : subhnb z0.b, z0.h, z0.h : subhnb %z0.h %z0.h -> %z0.b 45647062 : subhnb z2.b, z3.h, z4.h : subhnb %z3.h %z4.h -> %z2.b diff --git a/suite/tests/api/ir_aarch64_sve2.c b/suite/tests/api/ir_aarch64_sve2.c index 56c9810de66..025683de45c 100644 --- a/suite/tests/api/ir_aarch64_sve2.c +++ b/suite/tests/api/ir_aarch64_sve2.c @@ -8359,6 +8359,262 @@ TEST_INSTR(whilewr_sve) opnd_create_reg_element_vector(Pn_six_offset_0[i], OPSZ_8), opnd_create_reg(Xn_six_offset_1[i]), opnd_create_reg(Xn_six_offset_2[i])); } + +TEST_INSTR(ldnt1b_sve_pred) +{ + + /* Testing LDNT1B { <Zt>.D }, <Pg>/Z, [<Zn>.D{, <Xm>}] */ + const char *const expected_0_0[6] = { + "ldnt1b (%z0.d,%x0)[4byte] %p0/z -> %z0.d", + "ldnt1b (%z7.d,%x8)[4byte] %p2/z -> %z5.d", + "ldnt1b (%z12.d,%x13)[4byte] %p3/z -> %z10.d", + "ldnt1b (%z18.d,%x18)[4byte] %p5/z -> %z16.d", + "ldnt1b (%z23.d,%x23)[4byte] %p6/z -> %z21.d", + "ldnt1b (%z31.d,%x30)[4byte] %p7/z -> %z31.d", + }; + TEST_LOOP(ldnt1b, ldnt1b_sve_pred, 6, expected_0_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_8), + opnd_create_predicate_reg(Pn_half_six_offset_0[i], false), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_8, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_4, 0)); + + /* Testing LDNT1B { <Zt>.S }, <Pg>/Z, [<Zn>.S{, <Xm>}] */ + const char *const expected_1_0[6] = { + "ldnt1b (%z0.s,%x0)[8byte] %p0/z -> %z0.s", + "ldnt1b (%z7.s,%x8)[8byte] %p2/z -> %z5.s", + "ldnt1b (%z12.s,%x13)[8byte] %p3/z -> %z10.s", + "ldnt1b (%z18.s,%x18)[8byte] 
%p5/z -> %z16.s", + "ldnt1b (%z23.s,%x23)[8byte] %p6/z -> %z21.s", + "ldnt1b (%z31.s,%x30)[8byte] %p7/z -> %z31.s", + }; + TEST_LOOP(ldnt1b, ldnt1b_sve_pred, 6, expected_1_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_4), + opnd_create_predicate_reg(Pn_half_six_offset_0[i], false), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_4, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_8, 0)); +} + +TEST_INSTR(ldnt1d_sve_pred) +{ + + /* Testing LDNT1D { <Zt>.D }, <Pg>/Z, [<Zn>.D{, <Xm>}] */ + const char *const expected_0_0[6] = { + "ldnt1d (%z0.d,%x0)[32byte] %p0/z -> %z0.d", + "ldnt1d (%z7.d,%x8)[32byte] %p2/z -> %z5.d", + "ldnt1d (%z12.d,%x13)[32byte] %p3/z -> %z10.d", + "ldnt1d (%z18.d,%x18)[32byte] %p5/z -> %z16.d", + "ldnt1d (%z23.d,%x23)[32byte] %p6/z -> %z21.d", + "ldnt1d (%z31.d,%x30)[32byte] %p7/z -> %z31.d", + }; + TEST_LOOP(ldnt1d, ldnt1d_sve_pred, 6, expected_0_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_8), + opnd_create_predicate_reg(Pn_half_six_offset_0[i], false), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_8, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_32, 0)); +} + +TEST_INSTR(ldnt1h_sve_pred) +{ + + /* Testing LDNT1H { <Zt>.D }, <Pg>/Z, [<Zn>.D{, <Xm>}] */ + const char *const expected_0_0[6] = { + "ldnt1h (%z0.d,%x0)[8byte] %p0/z -> %z0.d", + "ldnt1h (%z7.d,%x8)[8byte] %p2/z -> %z5.d", + "ldnt1h (%z12.d,%x13)[8byte] %p3/z -> %z10.d", + "ldnt1h (%z18.d,%x18)[8byte] %p5/z -> %z16.d", + "ldnt1h (%z23.d,%x23)[8byte] %p6/z -> %z21.d", + "ldnt1h (%z31.d,%x30)[8byte] %p7/z -> %z31.d", + }; + TEST_LOOP(ldnt1h, ldnt1h_sve_pred, 6, expected_0_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_8), + opnd_create_predicate_reg(Pn_half_six_offset_0[i], false), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_8, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_8, 0)); + + /* Testing LDNT1H { <Zt>.S }, <Pg>/Z, [<Zn>.S{, <Xm>}] */ + const char *const expected_1_0[6] = { + "ldnt1h 
(%z0.s,%x0)[16byte] %p0/z -> %z0.s", + "ldnt1h (%z7.s,%x8)[16byte] %p2/z -> %z5.s", + "ldnt1h (%z12.s,%x13)[16byte] %p3/z -> %z10.s", + "ldnt1h (%z18.s,%x18)[16byte] %p5/z -> %z16.s", + "ldnt1h (%z23.s,%x23)[16byte] %p6/z -> %z21.s", + "ldnt1h (%z31.s,%x30)[16byte] %p7/z -> %z31.s", + }; + TEST_LOOP(ldnt1h, ldnt1h_sve_pred, 6, expected_1_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_4), + opnd_create_predicate_reg(Pn_half_six_offset_0[i], false), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_4, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_16, 0)); +} + +TEST_INSTR(ldnt1w_sve_pred) +{ + + /* Testing LDNT1W { <Zt>.D }, <Pg>/Z, [<Zn>.D{, <Xm>}] */ + const char *const expected_0_0[6] = { + "ldnt1w (%z0.d,%x0)[16byte] %p0/z -> %z0.d", + "ldnt1w (%z7.d,%x8)[16byte] %p2/z -> %z5.d", + "ldnt1w (%z12.d,%x13)[16byte] %p3/z -> %z10.d", + "ldnt1w (%z18.d,%x18)[16byte] %p5/z -> %z16.d", + "ldnt1w (%z23.d,%x23)[16byte] %p6/z -> %z21.d", + "ldnt1w (%z31.d,%x30)[16byte] %p7/z -> %z31.d", + }; + TEST_LOOP(ldnt1w, ldnt1w_sve_pred, 6, expected_0_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_8), + opnd_create_predicate_reg(Pn_half_six_offset_0[i], false), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_8, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_16, 0)); + + /* Testing LDNT1W { <Zt>.S }, <Pg>/Z, [<Zn>.S{, <Xm>}] */ + const char *const expected_1_0[6] = { + "ldnt1w (%z0.s,%x0)[32byte] %p0/z -> %z0.s", + "ldnt1w (%z7.s,%x8)[32byte] %p2/z -> %z5.s", + "ldnt1w (%z12.s,%x13)[32byte] %p3/z -> %z10.s", + "ldnt1w (%z18.s,%x18)[32byte] %p5/z -> %z16.s", + "ldnt1w (%z23.s,%x23)[32byte] %p6/z -> %z21.s", + "ldnt1w (%z31.s,%x30)[32byte] %p7/z -> %z31.s", + }; + TEST_LOOP(ldnt1w, ldnt1w_sve_pred, 6, expected_1_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_4), + opnd_create_predicate_reg(Pn_half_six_offset_0[i], false), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_4, 
DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_32, 0)); +} + +TEST_INSTR(stnt1b_sve_pred) +{ + + /* Testing STNT1B { <Zt>.D }, <Pg>, [<Zn>.D{, <Xm>}] */ + const char *const expected_0_0[6] = { + "stnt1b %z0.d %p0 -> (%z0.d,%x0)[4byte]", + "stnt1b %z5.d %p2 -> (%z7.d,%x8)[4byte]", + "stnt1b %z10.d %p3 -> (%z12.d,%x13)[4byte]", + "stnt1b %z16.d %p5 -> (%z18.d,%x18)[4byte]", + "stnt1b %z21.d %p6 -> (%z23.d,%x23)[4byte]", + "stnt1b %z31.d %p7 -> (%z31.d,%x30)[4byte]", + }; + TEST_LOOP(stnt1b, stnt1b_sve_pred, 6, expected_0_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_8), + opnd_create_reg(Pn_half_six_offset_0[i]), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_8, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_4, 0)); + + /* Testing STNT1B { <Zt>.S }, <Pg>, [<Zn>.S{, <Xm>}] */ + const char *const expected_1_0[6] = { + "stnt1b %z0.s %p0 -> (%z0.s,%x0)[8byte]", + "stnt1b %z5.s %p2 -> (%z7.s,%x8)[8byte]", + "stnt1b %z10.s %p3 -> (%z12.s,%x13)[8byte]", + "stnt1b %z16.s %p5 -> (%z18.s,%x18)[8byte]", + "stnt1b %z21.s %p6 -> (%z23.s,%x23)[8byte]", + "stnt1b %z31.s %p7 -> (%z31.s,%x30)[8byte]", + }; + TEST_LOOP(stnt1b, stnt1b_sve_pred, 6, expected_1_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_4), + opnd_create_reg(Pn_half_six_offset_0[i]), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_4, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_8, 0)); +} + +TEST_INSTR(stnt1d_sve_pred) +{ + + /* Testing STNT1D { <Zt>.D }, <Pg>, [<Zn>.D{, <Xm>}] */ + const char *const expected_0_0[6] = { + "stnt1d %z0.d %p0 -> (%z0.d,%x0)[32byte]", + "stnt1d %z5.d %p2 -> (%z7.d,%x8)[32byte]", + "stnt1d %z10.d %p3 -> (%z12.d,%x13)[32byte]", + "stnt1d %z16.d %p5 -> (%z18.d,%x18)[32byte]", + "stnt1d %z21.d %p6 -> (%z23.d,%x23)[32byte]", + "stnt1d %z31.d %p7 -> (%z31.d,%x30)[32byte]", + }; + TEST_LOOP(stnt1d, stnt1d_sve_pred, 6, expected_0_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_8), + opnd_create_reg(Pn_half_six_offset_0[i]), + 
opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_8, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_32, 0)); +} + +TEST_INSTR(stnt1h_sve_pred) +{ + + /* Testing STNT1H { <Zt>.D }, <Pg>, [<Zn>.D{, <Xm>}] */ + const char *const expected_0_0[6] = { + "stnt1h %z0.d %p0 -> (%z0.d,%x0)[8byte]", + "stnt1h %z5.d %p2 -> (%z7.d,%x8)[8byte]", + "stnt1h %z10.d %p3 -> (%z12.d,%x13)[8byte]", + "stnt1h %z16.d %p5 -> (%z18.d,%x18)[8byte]", + "stnt1h %z21.d %p6 -> (%z23.d,%x23)[8byte]", + "stnt1h %z31.d %p7 -> (%z31.d,%x30)[8byte]", + }; + TEST_LOOP(stnt1h, stnt1h_sve_pred, 6, expected_0_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_8), + opnd_create_reg(Pn_half_six_offset_0[i]), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_8, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_8, 0)); + + /* Testing STNT1H { <Zt>.S }, <Pg>, [<Zn>.S{, <Xm>}] */ + const char *const expected_1_0[6] = { + "stnt1h %z0.s %p0 -> (%z0.s,%x0)[16byte]", + "stnt1h %z5.s %p2 -> (%z7.s,%x8)[16byte]", + "stnt1h %z10.s %p3 -> (%z12.s,%x13)[16byte]", + "stnt1h %z16.s %p5 -> (%z18.s,%x18)[16byte]", + "stnt1h %z21.s %p6 -> (%z23.s,%x23)[16byte]", + "stnt1h %z31.s %p7 -> (%z31.s,%x30)[16byte]", + }; + TEST_LOOP(stnt1h, stnt1h_sve_pred, 6, expected_1_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_4), + opnd_create_reg(Pn_half_six_offset_0[i]), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_4, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_16, 0)); +} + +TEST_INSTR(stnt1w_sve_pred) +{ + + /* Testing STNT1W { <Zt>.D }, <Pg>, [<Zn>.D{, <Xm>}] */ + const char *const expected_0_0[6] = { + "stnt1w %z0.d %p0 -> (%z0.d,%x0)[16byte]", + "stnt1w %z5.d %p2 -> (%z7.d,%x8)[16byte]", + "stnt1w %z10.d %p3 -> (%z12.d,%x13)[16byte]", + "stnt1w %z16.d %p5 -> (%z18.d,%x18)[16byte]", + "stnt1w %z21.d %p6 -> (%z23.d,%x23)[16byte]", + "stnt1w %z31.d %p7 -> (%z31.d,%x30)[16byte]", + }; + TEST_LOOP(stnt1w, stnt1w_sve_pred, 6, expected_0_0[i], + 
opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_8), + opnd_create_reg(Pn_half_six_offset_0[i]), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_8, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_16, 0)); + + /* Testing STNT1W { <Zt>.S }, <Pg>, [<Zn>.S{, <Xm>}] */ + const char *const expected_1_0[6] = { + "stnt1w %z0.s %p0 -> (%z0.s,%x0)[32byte]", + "stnt1w %z5.s %p2 -> (%z7.s,%x8)[32byte]", + "stnt1w %z10.s %p3 -> (%z12.s,%x13)[32byte]", + "stnt1w %z16.s %p5 -> (%z18.s,%x18)[32byte]", + "stnt1w %z21.s %p6 -> (%z23.s,%x23)[32byte]", + "stnt1w %z31.s %p7 -> (%z31.s,%x30)[32byte]", + }; + TEST_LOOP(stnt1w, stnt1w_sve_pred, 6, expected_1_0[i], + opnd_create_reg_element_vector(Zn_six_offset_0[i], OPSZ_4), + opnd_create_reg(Pn_half_six_offset_0[i]), + opnd_create_vector_base_disp_aarch64(Zn_six_offset_2[i], Xn_six_offset_3[i], + OPSZ_4, DR_EXTEND_UXTX, 0, 0, 0, + OPSZ_32, 0)); +} int main(int argc, char *argv[]) { @@ -8603,6 +8859,15 @@ main(int argc, char *argv[]) RUN_INSTR_TEST(whilerw_sve); RUN_INSTR_TEST(whilewr_sve); + RUN_INSTR_TEST(ldnt1b_sve_pred); + RUN_INSTR_TEST(ldnt1d_sve_pred); + RUN_INSTR_TEST(ldnt1h_sve_pred); + RUN_INSTR_TEST(ldnt1w_sve_pred); + RUN_INSTR_TEST(stnt1b_sve_pred); + RUN_INSTR_TEST(stnt1d_sve_pred); + RUN_INSTR_TEST(stnt1h_sve_pred); + RUN_INSTR_TEST(stnt1w_sve_pred); + print("All SVE2 tests complete.\n"); #ifndef STANDALONE_DECODER dr_standalone_exit();