diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp index c35d4064100..9310abe2b02 100644 --- a/clients/drcachesim/analyzer_multi.cpp +++ b/clients/drcachesim/analyzer_multi.cpp @@ -262,6 +262,7 @@ analyzer_multi_t::init_dynamic_schedule() sched_ops.syscall_switch_threshold = op_sched_syscall_switch_us.get_value(); sched_ops.blocking_switch_threshold = op_sched_blocking_switch_us.get_value(); sched_ops.block_time_scale = op_sched_block_scale.get_value(); + sched_ops.block_time_max = op_sched_block_max_us.get_value(); #ifdef HAS_ZIP if (!op_record_file.get_value().empty()) { record_schedule_zip_.reset(new zipfile_ostream_t(op_record_file.get_value())); diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 74adf921095..52f8660efb6 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -851,11 +851,18 @@ droption_t op_sched_blocking_switch_us( "maybe-blocking to incur a context switch. Applies to -core_sharded and " "-core_serial. "); -droption_t - op_sched_block_scale(DROPTION_SCOPE_ALL, "sched_block_scale", 1., - "Input block time scale factor", - "A higher value here results in blocking syscalls " - "keeping inputs unscheduled for longer."); +droption_t op_sched_block_scale( + DROPTION_SCOPE_ALL, "sched_block_scale", 1000., "Input block time scale factor", + "The scale applied to the microsecond latency of blocking system calls. A higher " + "value here results in blocking syscalls keeping inputs unscheduled for longer. 
" + "This should roughly equal the slowdown of instruction record processing versus the " + "original (untraced) application execution."); + +droption_t op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us", + 25000000, + "Maximum blocked input time, in microseconds", + "The maximum blocked time, after scaling with " + "-sched_block_scale."); #ifdef HAS_ZIP droption_t op_record_file(DROPTION_SCOPE_FRONTEND, "record_file", "", "Path for storing record of schedule", @@ -875,8 +882,8 @@ droption_t // Schedule_stats options. droption_t - op_schedule_stats_print_every(DROPTION_SCOPE_ALL, "schedule_stats_print_every", 5000, - "A letter is printed every N instrs", + op_schedule_stats_print_every(DROPTION_SCOPE_ALL, "schedule_stats_print_every", + 500000, "A letter is printed every N instrs", "A letter is printed every N instrs or N waits"); droption_t op_syscall_template_file( diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h index 0a3a999ae55..dbf1c57ca47 100644 --- a/clients/drcachesim/common/options.h +++ b/clients/drcachesim/common/options.h @@ -194,6 +194,7 @@ extern dynamorio::droption::droption_t op_sched_order_time; extern dynamorio::droption::droption_t op_sched_syscall_switch_us; extern dynamorio::droption::droption_t op_sched_blocking_switch_us; extern dynamorio::droption::droption_t op_sched_block_scale; +extern dynamorio::droption::droption_t op_sched_block_max_us; #ifdef HAS_ZIP extern dynamorio::droption::droption_t op_record_file; extern dynamorio::droption::droption_t op_replay_file; diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 393d42a40a9..a75fd13eba2 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -1576,10 +1576,12 @@ scheduler_tmpl_t::add_to_ready_queue(input_info_t *input VPRINT( this, 4, "add_to_ready_queue (pre-size %zu): input %d priority %d timestamp delta %" PRIu64 - " block factor 
%3.2f time %" PRIu64 "\n", + " block time %" PRIu64 " start time %" PRIu64 "\n", ready_priority_.size(), input->index, input->priority, - input->reader->get_last_timestamp() - input->base_timestamp, - input->block_time_factor, input->blocked_start_time); + input->reader->get_last_timestamp() - input->base_timestamp, input->blocked_time, + input->blocked_start_time); + if (input->blocked_time > 0) + ++num_blocked_; input->queue_counter = ++ready_counter_; ready_priority_.push(input); } @@ -1593,6 +1595,7 @@ scheduler_tmpl_t::pop_from_ready_queue( std::set blocked; input_info_t *res = nullptr; sched_type_t::stream_status_t status = STATUS_OK; + uint64_t cur_time = (num_blocked_ > 0) ? get_output_time(for_output) : 0; while (!ready_priority_.empty()) { res = ready_priority_.top(); ready_priority_.pop(); @@ -1600,33 +1603,19 @@ scheduler_tmpl_t::pop_from_ready_queue( // For blocked inputs, as we don't have interrupts or other regular // control points we only check for being unblocked when an input // would be chosen to run. We thus keep blocked inputs in the ready queue. - if (options_.quantum_unit == QUANTUM_TIME) { - if (res->block_time_factor > 0 && - outputs_[for_output].cur_time - res->blocked_start_time < - options_.block_time_scale * res->block_time_factor) { - VPRINT( - this, 4, - "pop queue: %d still blocked for %4.1f (%3.2f * %3.2f - (%" PRIu64 - " - %" PRIu64 ")\n", - res->index, - options_.block_time_scale * res->block_time_factor - - (outputs_[for_output].cur_time - res->blocked_start_time), - options_.block_time_scale, res->block_time_factor, - outputs_[for_output].cur_time, res->blocked_start_time); - // We keep searching for a suitable input. - blocked.insert(res); - } else - break; - } else { - if (res->block_time_factor > 0) { - VPRINT(this, 4, "pop queue: %d still blocked for %4.1f\n", res->index, - res->block_time_factor); - --res->block_time_factor; - // We keep searching for a suitable input. 
- blocked.insert(res); - } else - break; + if (res->blocked_time > 0) { + assert(cur_time > 0); + --num_blocked_; } + if (res->blocked_time > 0 && + cur_time - res->blocked_start_time < res->blocked_time) { + VPRINT(this, 4, "pop queue: %d still blocked for %" PRIu64 "\n", + res->index, + res->blocked_time - (cur_time - res->blocked_start_time)); + // We keep searching for a suitable input. + blocked.insert(res); + } else + break; } else { // We keep searching for a suitable input. skipped.insert(res); @@ -1645,13 +1634,21 @@ scheduler_tmpl_t::pop_from_ready_queue( // Re-add the blocked ones to the back. for (input_info_t *save : blocked) add_to_ready_queue(save); + VDO(this, 1, { + static int heartbeat; + if (++heartbeat % 500 == 0) { + VPRINT(this, 1, "heartbeat[%d] %zd in queue; %d blocked => %d %d\n", + for_output, ready_priority_.size(), num_blocked_, + res == nullptr ? -1 : res->index, status); + } + }); if (res != nullptr) { VPRINT(this, 4, "pop_from_ready_queue[%d] (post-size %zu): input %d priority %d timestamp " "delta %" PRIu64 "\n", for_output, ready_priority_.size(), res->index, res->priority, res->reader->get_last_timestamp() - res->base_timestamp); - res->block_time_factor = 0.; + res->blocked_time = 0; } new_input = res; return status; @@ -1660,7 +1657,7 @@ scheduler_tmpl_t::pop_from_ready_queue( template bool scheduler_tmpl_t::syscall_incurs_switch(input_info_t *input, - double &block_time_factor) + uint64_t &blocked_time) { uint64_t post_time = input->reader->get_last_timestamp(); assert(input->processing_syscall || input->processing_maybe_blocking_syscall); @@ -1668,7 +1665,7 @@ scheduler_tmpl_t::syscall_incurs_switch(input_info_t *in // This is a legacy trace that does not have timestamps bracketing syscalls. // We switch on every maybe-blocking syscall in this case and have a simplified // blocking model. 
- block_time_factor = 1.; + blocked_time = options_.blocking_switch_threshold; return input->processing_maybe_blocking_syscall; } assert(input->pre_syscall_timestamp > 0); @@ -1677,13 +1674,22 @@ scheduler_tmpl_t::syscall_incurs_switch(input_info_t *in uint64_t threshold = input->processing_maybe_blocking_syscall ? options_.blocking_switch_threshold : options_.syscall_switch_threshold; - block_time_factor = static_cast(latency) / threshold; - if (options_.quantum_unit == QUANTUM_INSTRUCTIONS) - block_time_factor *= options_.block_time_scale; - VPRINT(this, 3, "input %d %ssyscall latency %" PRIu64 " => factor %4.1f\n", + blocked_time = + static_cast(static_cast(latency) * options_.block_time_scale); + if (blocked_time > options_.block_time_max) { + // We have a max to avoid outlier latencies that are already a second or + // more from scaling up to tens of minutes. We assume a cap is representative + // as the outliers likely were not part of key dependence chains. Without a + // cap the other threads all finish and the simulation waits for tens of + // minutes further for a couple of outliers. + blocked_time = options_.block_time_max; + } + VPRINT(this, 3, + "input %d %ssyscall latency %" PRIu64 " * scale %5.1f => blocked time %" PRIu64 + "\n", input->index, input->processing_maybe_blocking_syscall ? "maybe-blocking " : "", latency, - block_time_factor); + options_.block_time_scale, blocked_time); return latency >= threshold; } @@ -1877,7 +1883,7 @@ scheduler_tmpl_t::pick_next_input_as_previously( template typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::pick_next_input(output_ordinal_t output, - double block_time_factor) + uint64_t blocked_time) { sched_type_t::stream_status_t res = sched_type_t::STATUS_OK; bool need_lock = @@ -1917,14 +1923,13 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu if (res != sched_type_t::STATUS_OK) return res; } else if (options_.mapping == MAP_TO_ANY_OUTPUT) { - if (block_time_factor > 0. 
&& prev_index != INVALID_INPUT_ORDINAL) { + if (blocked_time > 0 && prev_index != INVALID_INPUT_ORDINAL) { std::lock_guard lock(*inputs_[prev_index].lock); - if (inputs_[prev_index].block_time_factor == 0.) { - VPRINT(this, 2, "next_record[%d]: block time factor %3.2f\n", - output, block_time_factor); - inputs_[prev_index].block_time_factor = block_time_factor; - inputs_[prev_index].blocked_start_time = - outputs_[output].cur_time; + if (inputs_[prev_index].blocked_time == 0) { + VPRINT(this, 2, "next_record[%d]: blocked time %" PRIu64 "\n", + output, blocked_time); + inputs_[prev_index].blocked_time = blocked_time; + inputs_[prev_index].blocked_start_time = get_output_time(output); } else { // If we looped we could have the same prev_index. assert(iters > 1); @@ -1942,13 +1947,13 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu ready_priority_.erase(target); index = target->index; // Erase any remaining wait time for the target. - if (target->block_time_factor != 0.) { + if (target->blocked_time > 0) { VPRINT(this, 3, - "next_record[%d]: direct switch erasing block time " - "factor " - "%4.1f for input %d\n", - output, target->block_time_factor, target->index); - target->block_time_factor = 0.; + "next_record[%d]: direct switch erasing blocked time " + "for input %d\n", + output, target->index); + --num_blocked_; + target->blocked_time = 0; } } else { // TODO i#5843: If the target is running on another output, we @@ -1965,13 +1970,14 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu } if (index != INVALID_INPUT_ORDINAL) { // We found a direct switch target above. - } else if (ready_queue_empty() && block_time_factor == 0.) 
{ + } else if (ready_queue_empty() && blocked_time == 0) { if (prev_index == INVALID_INPUT_ORDINAL) return eof_or_idle(output); - std::lock_guard lock(*inputs_[prev_index].lock); - if (inputs_[prev_index].at_eof) + auto lock = std::unique_lock(*inputs_[prev_index].lock); + if (inputs_[prev_index].at_eof) { + lock.unlock(); return eof_or_idle(output); - else + } else index = prev_index; // Go back to prior. } else { // Give up the input before we go to the queue so we can add @@ -1997,8 +2003,7 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu return status; } if (queue_next == nullptr) { - assert(block_time_factor == 0. || - prev_index == INVALID_INPUT_ORDINAL); + assert(blocked_time == 0 || prev_index == INVALID_INPUT_ORDINAL); return eof_or_idle(output); } index = queue_next->index; @@ -2070,6 +2075,12 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, // We do not enforce a globally increasing time to avoid the synchronization cost; we // do return an error on a time smaller than an input's current start time when we // check for quantum end. + if (cur_time == 0) { + // It's more efficient for QUANTUM_TIME to get the time here instead of + // in get_output_time(). This also makes the two more similarly behaved + // with respect to blocking system calls. + cur_time = get_time_micros(); + } outputs_[output].cur_time = cur_time; // Invalid values are checked below. 
if (!outputs_[output].active) return sched_type_t::STATUS_IDLE; @@ -2170,7 +2181,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, VDO(this, 5, print_record(record);); bool need_new_input = false; bool preempt = false; - double block_time_factor = 0.; + uint64_t blocked_time = 0; uint64_t prev_time_in_quantum = 0; if (options_.mapping == MAP_AS_PREVIOUSLY) { assert(outputs_[output].record_index >= 0); @@ -2235,7 +2246,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, input->switch_to_input = it->second; } } else if (record_type_is_instr(record)) { - if (syscall_incurs_switch(input, block_time_factor)) { + if (syscall_incurs_switch(input, blocked_time)) { // Model as blocking and should switch to a different input. need_new_input = true; VPRINT(this, 3, @@ -2323,8 +2334,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, VPRINT(this, 5, "next_record[%d]: queuing candidate record\n", output); input->queue.push_back(record); lock.unlock(); - sched_type_t::stream_status_t res = - pick_next_input(output, block_time_factor); + sched_type_t::stream_status_t res = pick_next_input(output, blocked_time); if (res != sched_type_t::STATUS_OK && res != sched_type_t::STATUS_WAIT && res != sched_type_t::STATUS_SKIPPED) return res; @@ -2385,7 +2395,8 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, } break; } - VPRINT(this, 4, "next_record[%d]: from %d: ", output, input->index); + VPRINT(this, 4, "next_record[%d]: from %d @%" PRId64 ": ", output, input->index, + cur_time); VDO(this, 4, print_record(record);); outputs_[output].last_record = record; @@ -2493,6 +2504,7 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output) assert(options_.mapping != MAP_AS_PREVIOUSLY || outputs_[output].at_eof); return sched_type_t::STATUS_EOF; } else { + set_cur_input(output, INVALID_INPUT_ORDINAL); outputs_[output].waiting = true; return sched_type_t::STATUS_IDLE; } diff --git a/clients/drcachesim/scheduler/scheduler.h 
b/clients/drcachesim/scheduler/scheduler.h index ef218a9d350..b428480d654 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -524,16 +524,27 @@ template class scheduler_tmpl_t { uint64_t blocking_switch_threshold = 100; /** * Controls the amount of time inputs are considered blocked at a syscall whose - * latency exceeds #syscall_switch_threshold or #blocking_switch_threshold. A - * "block time factor" is computed from the syscall latency divided by either - * #syscall_switch_threshold or #blocking_switch_threshold. This factor is - * multiplied by this field #block_time_scale to produce a final value. For - * #QUANTUM_TIME, that final value's amount of time, as reported by the time - * parameter to next_record(), must pass before the input is no longer considered - * blocked. For instruction quanta, that final value's count of scheduler - * selections must occur before the input is actually selected. + * latency exceeds #syscall_switch_threshold or #blocking_switch_threshold. The + * syscall latency (in microseconds) is multiplied by this field to produce the + * blocked time. For #QUANTUM_TIME, that blocked time in the units reported by + * the time parameter to next_record() must pass before the input is no longer + * considered blocked. Since the system call latencies are in microseconds, this + * #block_time_scale should be set to the number of next_record() time units in + * one simulated microsecond. For #QUANTUM_INSTRUCTIONS, the blocked time in + * wall-clock microseconds must pass before the input is actually selected + * (wall-clock time is used as there is no reasonable alternative with no other + * uniform notion of time); thus, the #block_time_scale value here should equal + * the slowdown of the instruction record processing versus the original + * (untraced) application execution. The blocked time is clamped to a maximum + * value controlled by #block_time_max. 
*/ - double block_time_scale = 1.; + double block_time_scale = 1000.; + /** + * The maximum time, in microseconds, for an input to be considered blocked + * for any one system call. This is applied after multiplying by + * #block_time_scale. + */ + uint64_t block_time_max = 25000000; }; /** @@ -1018,8 +1029,9 @@ template class scheduler_tmpl_t { uint64_t prev_time_in_quantum = 0; uint64_t time_spent_in_quantum = 0; // These fields model waiting at a blocking syscall. - double block_time_factor = 0.; - uint64_t blocked_start_time = 0; // For QUANTUM_TIME only. + // The units are us for instr quanta and simulation time for time quanta. + uint64_t blocked_time = 0; + uint64_t blocked_start_time = 0; }; // Format for recording a schedule to disk. A separate sequence of these records @@ -1232,7 +1244,7 @@ template class scheduler_tmpl_t { // Finds the next input stream for the 'output_ordinal'-th output stream. // No input_info_t lock can be held on entry. stream_status_t - pick_next_input(output_ordinal_t output, double block_time_factor); + pick_next_input(output_ordinal_t output, uint64_t blocked_time); // Helper for pick_next_input() for MAP_AS_PREVIOUSLY. // No input_info_t lock can be held on entry. @@ -1354,7 +1366,7 @@ template class scheduler_tmpl_t { // The input's lock must be held by the caller. // Returns a multiplier for how long the input should be considered blocked. bool - syscall_incurs_switch(input_info_t *input, double &block_time_factor); + syscall_incurs_switch(input_info_t *input, uint64_t &blocked_time); // sched_lock_ must be held by the caller. // "for_output" is which output stream is looking for a new input; only an @@ -1385,6 +1397,8 @@ template class scheduler_tmpl_t { // timestamp in each workload in order to mix inputs from different workloads in the // same queue. FIFO ordering is used for same-priority entries. flexible_queue_t ready_priority_; + // Tracks the count of blocked inputs. Protected by sched_lock_. 
+ int num_blocked_ = 0; // Global ready queue counter used to provide FIFO for same-priority inputs. uint64_t ready_counter_ = 0; // Count of inputs not yet at eof. diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex index 8eeada43d63..c5c98e0e9c0 100644 --- a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex +++ b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex @@ -3,10 +3,10 @@ Total counts: 4 cores 8 threads 638938 instructions - 5 total context switches - 0\.0078255 CSPKI \(context switches per 1000 instructions\) - 127788 instructions per context switch - 5 voluntary context switches + 6 total context switches + 0\.0093906 CSPKI \(context switches per 1000 instructions\) + 106490 instructions per context switch + 6 voluntary context switches 0 direct context switches 100\.00% voluntary switches 0\.00% direct switches @@ -15,7 +15,12 @@ Total counts: 0 direct switch requests 0 waits *[0-9]* idles - *[0-9\.]*% cpu busy + *[0-9\.]*% cpu busy by record count + *[0-9]* cpu microseconds + *[0-9]* idle microseconds + *[0-9]* idle microseconds at last instr + *[0-9\.]*% cpu busy by time + *[0-9\.]*% cpu busy by time, ignoring idle past last instr Core #0 counts: . threads *[0-9]* instructions @@ -31,7 +36,12 @@ Core #0 counts: 0 direct switch requests 0 waits *[0-9]* idles - *[0-9\.]*% cpu busy + *[0-9\.]*% cpu busy by record count + *[0-9]* cpu microseconds + *[0-9]* idle microseconds + *[0-9]* idle microseconds at last instr + *[0-9\.]*% cpu busy by time + *[0-9\.]*% cpu busy by time, ignoring idle past last instr Core #1 counts: . 
threads *[0-9]* instructions @@ -47,7 +57,12 @@ Core #1 counts: 0 direct switch requests 0 waits *[0-9]* idles - *[0-9\.]*% cpu busy + *[0-9\.]*% cpu busy by record count + *[0-9]* cpu microseconds + *[0-9]* idle microseconds + *[0-9]* idle microseconds at last instr + *[0-9\.]*% cpu busy by time + *[0-9\.]*% cpu busy by time, ignoring idle past last instr Core #2 counts: . threads *[0-9]* instructions @@ -63,7 +78,12 @@ Core #2 counts: 0 direct switch requests 0 waits *[0-9]* idles - *[0-9\.]*% cpu busy + *[0-9\.]*% cpu busy by record count + *[0-9]* cpu microseconds + *[0-9]* idle microseconds + *[0-9]* idle microseconds at last instr + *[0-9\.]*% cpu busy by time + *[0-9\.]*% cpu busy by time, ignoring idle past last instr Core #3 counts: . threads *[0-9]* instructions @@ -79,7 +99,12 @@ Core #3 counts: 0 direct switch requests 0 waits *[0-9]* idles - *[0-9\.]*% cpu busy + *[0-9\.]*% cpu busy by record count + *[0-9]* cpu microseconds + *[0-9]* idle microseconds + *[0-9]* idle microseconds at last instr + *[0-9\.]*% cpu busy by time + *[0-9\.]*% cpu busy by time, ignoring idle past last instr Core #0 schedule: [A-Ha-h_]* Core #1 schedule: [A-Ha-h_]* Core #2 schedule: [A-Ha-h_]* diff --git a/clients/drcachesim/tests/schedule_stats_test.cpp b/clients/drcachesim/tests/schedule_stats_test.cpp index 48177635375..86e22894ce1 100644 --- a/clients/drcachesim/tests/schedule_stats_test.cpp +++ b/clients/drcachesim/tests/schedule_stats_test.cpp @@ -196,6 +196,8 @@ test_basic_stats() assert(result.maybe_blocking_syscalls == 3); assert(result.direct_switch_requests == 2); assert(result.waits == 3); + assert(result.idle_microseconds == 0); + assert(result.cpu_microseconds > 0); return true; } @@ -216,6 +218,7 @@ test_idle() gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), + // A switch from idle w/ no syscall is an involuntary switch. 
gen_instr(TID_B), gen_instr(TID_B), gen_instr(TID_B), @@ -229,6 +232,7 @@ test_idle() gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), + // A switch from idle w/ no syscall is an involuntary switch. gen_instr(TID_C), gen_instr(TID_C), // Wait. @@ -243,7 +247,7 @@ test_idle() }; auto result = run_schedule_stats(memrefs, tid2ord); assert(result.instrs == 13); - assert(result.total_switches == 3); + assert(result.total_switches == 5); assert(result.voluntary_switches == 0); assert(result.direct_switches == 0); assert(result.syscalls == 0); @@ -251,6 +255,11 @@ test_idle() assert(result.direct_switch_requests == 0); assert(result.waits == 3); assert(result.idles == 6); + // It is hard to test wall-clock time precise values so we have sanity checks. + assert(result.idle_microseconds > 0); + assert(result.idle_micros_at_last_instr > 0 && + result.idle_micros_at_last_instr <= result.idle_microseconds); + assert(result.cpu_microseconds > 0); return true; } diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index d41d2119798..b8155671696 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -815,6 +815,7 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti for (int i = 0; i < num_outputs; i++) outputs[i] = scheduler.get_stream(i); int num_eof = 0; + int64_t meta_records = 0; // Record the threads, one char each. std::vector sched_as_string(num_outputs); static constexpr char THREAD_LETTER_START = 'A'; @@ -833,8 +834,9 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti // instruction quanta. This is a per-output time which technically // violates the globally-increasing requirement, so this will not work // perfectly with i/o waits, but should work fine for basic tests. 
+ // We add the wait and idle records to make progress with idle time. status = outputs[i]->next_record( - memref, outputs[i]->get_instruction_ordinal() + 1); + memref, outputs[i]->get_instruction_ordinal() + 1 + meta_records); } else { status = outputs[i]->next_record(memref); } @@ -845,10 +847,12 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti } if (status == scheduler_t::STATUS_WAIT) { sched_as_string[i] += WAIT_SYMBOL; + ++meta_records; continue; } if (status == scheduler_t::STATUS_IDLE) { sched_as_string[i] += IDLE_SYMBOL; + ++meta_records; continue; } assert(status == scheduler_t::STATUS_OK); @@ -898,7 +902,8 @@ test_synthetic() static constexpr int NUM_OUTPUTS = 2; static constexpr int NUM_INSTRS = 9; static constexpr int QUANTUM_DURATION = 3; - static constexpr double BLOCK_SCALE = 0.1; + // We do not want to block for very long. + static constexpr double BLOCK_SCALE = 0.01; static constexpr memref_tid_t TID_BASE = 100; std::vector inputs[NUM_INPUTS]; for (int i = 0; i < NUM_INPUTS; i++) { @@ -927,9 +932,10 @@ test_synthetic() // the usage to persist to their next scheduling which should only have // a single letter. static const char *const CORE0_SCHED_STRING = - "..AA......CCC..EEE..GGGEEEABGGGDDD.AAABBBAAA.___"; + "..AA......CCC..EEE..GGGDDDFFFBBBCCC.EEE.AAA.GGG."; static const char *const CORE1_SCHED_STRING = - "..BB......DDD..FFFCCCDDDFFFCCC.EEE.FFF.GGG.BBB."; + "..BB......DDD..FFFABCCCEEEAAAGGGDDD.FFF.BBB.____"; + { // Test instruction quanta. std::vector sched_inputs; @@ -945,7 +951,6 @@ test_synthetic() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration = QUANTUM_DURATION; - // We do not want to block for very long. 
sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != @@ -975,8 +980,7 @@ test_synthetic() /*verbosity=*/3); sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; sched_ops.quantum_duration = QUANTUM_DURATION; - // QUANTUM_INSTRUCTIONS divides by the threshold so to match we multiply. - sched_ops.block_time_scale = sched_ops.blocking_switch_threshold * BLOCK_SCALE; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != scheduler_t::STATUS_SUCCESS) @@ -1002,6 +1006,8 @@ test_synthetic_time_quanta() static constexpr memref_tid_t TID_C = TID_A + 2; static constexpr int NUM_OUTPUTS = 2; static constexpr int NUM_INPUTS = 3; + static constexpr int PRE_BLOCK_TIME = 20; + static constexpr int POST_BLOCK_TIME = 220; std::vector refs[NUM_INPUTS]; for (int i = 0; i < NUM_INPUTS; ++i) { refs[i].push_back(make_thread(TID_BASE + i)); @@ -1011,10 +1017,10 @@ test_synthetic_time_quanta() refs[i].push_back(make_instr(10)); refs[i].push_back(make_instr(30)); if (i == 0) { - refs[i].push_back(make_timestamp(20)); + refs[i].push_back(make_timestamp(PRE_BLOCK_TIME)); refs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); refs[i].push_back(make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); - refs[i].push_back(make_timestamp(220)); + refs[i].push_back(make_timestamp(POST_BLOCK_TIME)); } refs[i].push_back(make_instr(50)); refs[i].push_back(make_exit(TID_BASE + i)); @@ -1037,7 +1043,8 @@ test_synthetic_time_quanta() /*verbosity=*/4); sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; sched_ops.quantum_duration = 3; - sched_ops.block_time_scale = 5.; // Ensure it waits a while. + // Ensure it waits 10 steps. + sched_ops.block_time_scale = 10. 
/ (POST_BLOCK_TIME - PRE_BLOCK_TIME); zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != @@ -1454,6 +1461,8 @@ test_synthetic_with_syscalls_multiple() static constexpr int NUM_OUTPUTS = 2; static constexpr int NUM_INSTRS = 9; static constexpr memref_tid_t TID_BASE = 100; + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. / (BLOCK_LATENCY); std::vector sched_inputs; auto get_tid = [&](int workload_idx, int input_idx) { return TID_BASE + workload_idx * NUM_INPUTS_PER_WORKLOAD + input_idx; @@ -1485,8 +1494,8 @@ test_synthetic_with_syscalls_multiple() inputs.push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); inputs.push_back( make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); - // Blocked for 5 100-threshold blocking-syscall units. - inputs.push_back(make_timestamp(stamp + 550)); + // Blocked for 10 time units with our BLOCK_SCALE. + inputs.push_back(make_timestamp(stamp + 10 + 10 * BLOCK_LATENCY)); } else { // Insert meta records to keep the locksteps lined up. inputs.push_back(make_marker(TRACE_MARKER_TYPE_CPU_ID, 0)); @@ -1529,13 +1538,17 @@ test_synthetic_with_syscalls_multiple() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration = 3; + // We use our mock's time==instruction count for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != scheduler_t::STATUS_SUCCESS) assert(false); // We omit the "." marker chars to keep the strings short enough to be readable. 
std::vector sched_as_string = run_lockstep_simulation( - scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/false, /*print_markers=*/false); + scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true, /*print_markers=*/false); for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } @@ -1550,9 +1563,8 @@ test_synthetic_with_syscalls_multiple() // with the "." in run_lockstep_simulation(). The omitted "." markers also // explains why the two strings are different lengths. assert(sched_as_string[0] == - "BHHHFFFJJJJJJJBEEHHHIIIBIIIEEDDDBAAAEEGGGBDDD___B___B___B___B"); - assert(sched_as_string[1] == - "EECCCIIICCCJJFFFCCCFFFAAAHHHGGGDDDAAAGGGE__________________________"); + "BHHHFFFJJJJJJBEEHHHFFFBCCCEEIIIDDDBAAAGGGDDDB________B_______"); + assert(sched_as_string[1] == "EECCCIIICCCBEEJJJHHHIIIFFFEAAAGGGBDDDAAAGGG________B"); } static void @@ -1567,6 +1579,8 @@ test_synthetic_with_syscalls_single() static constexpr int NUM_OUTPUTS = 2; static constexpr int NUM_INSTRS = 9; static constexpr memref_tid_t TID_BASE = 100; + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. / (BLOCK_LATENCY); std::vector sched_inputs; auto get_tid = [&](int workload_idx, int input_idx) { return TID_BASE + workload_idx * NUM_INPUTS_PER_WORKLOAD + input_idx; @@ -1591,15 +1605,14 @@ test_synthetic_with_syscalls_single() inputs.push_back(make_timestamp(stamp)); } inputs.push_back(make_instr(42 + instr_idx * 4)); - // Insert some blocking syscalls in the high-priority (see below) - // middle threads. + // Insert some blocking syscalls. if (instr_idx % 3 == 1) { inputs.push_back(make_timestamp(stamp + 10)); inputs.push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); inputs.push_back( make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); - // Blocked for 3 100-threshold blocking-syscall units. - inputs.push_back(make_timestamp(stamp + 350)); + // Blocked for 3 time units. 
+ inputs.push_back(make_timestamp(stamp + 10 + 3 * BLOCK_LATENCY)); } else { // Insert meta records to keep the locksteps lined up. inputs.push_back(make_marker(TRACE_MARKER_TYPE_CPU_ID, 0)); @@ -1621,20 +1634,22 @@ test_synthetic_with_syscalls_single() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); sched_ops.quantum_duration = 3; + // We use our mock's time==instruction count for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != scheduler_t::STATUS_SUCCESS) assert(false); std::vector sched_as_string = - run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE); + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } // We expect an idle CPU every 3 instrs but starting at the 2nd (1-based % 3). - assert(sched_as_string[0] == - "..A....A....__A....A.....A....__A.....A....A....__A....."); - assert(sched_as_string[1] == - "________________________________________________________"); + assert(sched_as_string[0] == "..A....A....___________________________________A....."); + assert(sched_as_string[1] == "____________A....A.....A....__A.....A....A...._______"); } static bool @@ -1756,6 +1771,8 @@ test_synthetic_with_syscalls_latencies() static constexpr memref_tid_t TID_A = 42; static constexpr memref_tid_t TID_B = 99; static constexpr int SYSNUM = 202; + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. 
/ (BLOCK_LATENCY); std::vector refs_A = { /* clang-format off */ make_thread(TID_A), @@ -1779,7 +1796,7 @@ test_synthetic_with_syscalls_latencies() make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), make_marker(TRACE_MARKER_TYPE_FUNC_ID, 100), make_marker(TRACE_MARKER_TYPE_FUNC_ARG, 42), - make_timestamp(1100), + make_timestamp(1000 + BLOCK_LATENCY), make_marker(TRACE_MARKER_TYPE_CPU_ID, 1), make_marker(TRACE_MARKER_TYPE_FUNC_ID, 100), make_marker(TRACE_MARKER_TYPE_FUNC_RETVAL, 0), @@ -1809,14 +1826,20 @@ test_synthetic_with_syscalls_latencies() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); + // We use a mock time for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, 1, sched_ops) != scheduler_t::STATUS_SUCCESS) assert(false); auto *stream = scheduler.get_stream(0); memref_t memref; std::vector refs; - for (scheduler_t::stream_status_t status = stream->next_record(memref); - status != scheduler_t::STATUS_EOF; status = stream->next_record(memref)) { + int step = 0; + for (scheduler_t::stream_status_t status = stream->next_record(memref, ++step); + status != scheduler_t::STATUS_EOF; + status = stream->next_record(memref, ++step)) { if (status == scheduler_t::STATUS_WAIT) continue; assert(status == scheduler_t::STATUS_OK); @@ -1867,6 +1890,9 @@ test_synthetic_with_syscalls_idle() static constexpr int NUM_OUTPUTS = 1; static constexpr int NUM_INSTRS = 12; static constexpr memref_tid_t TID_BASE = 100; + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. 
/ (BLOCK_LATENCY); + static constexpr int BLOCK_UNITS = 27; std::vector sched_inputs; std::vector readers; for (int input_idx = 0; input_idx < NUM_INPUTS; input_idx++) { @@ -1886,15 +1912,17 @@ test_synthetic_with_syscalls_idle() inputs.push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); inputs.push_back( make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); - // Blocked for 2 100-threshold blocking-syscall units, but + // Blocked for BLOCK_UNITS time units with BLOCK_SCALE, but // after each queue rejection it should go to the back of // the queue and all the other inputs should be selected // before another retry. - inputs.push_back(make_timestamp(stamp + 210)); + inputs.push_back( + make_timestamp(stamp + 10 + BLOCK_UNITS * BLOCK_LATENCY)); } else { // Insert a timestamp to match the blocked input so the inputs // are all at equal priority in the queue. - inputs.push_back(make_timestamp(stamp + 210)); + inputs.push_back( + make_timestamp(stamp + 10 + BLOCK_UNITS * BLOCK_LATENCY)); } } } @@ -1908,20 +1936,23 @@ test_synthetic_with_syscalls_idle() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration = 3; + // We use a mock time for a deterministic result. 
+ sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != scheduler_t::STATUS_SUCCESS) assert(false); std::vector sched_as_string = - run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE); + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } // The timestamps provide the ABCD ordering, but A's blocking syscall after its - // 2nd instr makes it delayed for 3 full queue cycles of BCD BCD: A's duration - // of 2 is decremented after the 1st (to 1) and 2nd (to 0) and A is finally - // schedulable after the 3rd, when it just gets 1 instruction in before its - // (accumulated) count equals the quantum. + // 2nd instr makes it delayed for 3 full queue cycles of BBBCCCDDD (27 instrs, + // which is BLOCK_UNITS): A's is finally schedulable after the 3rd, when it just gets + // 1 instruction in before its (accumulated) count equals the quantum. assert(sched_as_string[0] == "..AA......BB.B..CC.C..DD.DBBBCCCDDDBBBCCCDDDABBB.CCC.DDD.AAAAAAAAA."); } @@ -2191,7 +2222,7 @@ test_replay() scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, - /*verbosity=*/2); + /*verbosity=*/3); sched_ops.quantum_duration = QUANTUM_INSTRS; zipfile_ostream_t outfile(record_fname); @@ -3360,6 +3391,8 @@ test_direct_switch() // We have just 1 output to better control the order and avoid flakiness. static constexpr int NUM_OUTPUTS = 1; static constexpr int QUANTUM_DURATION = 100; // Never reached. + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. 
/ (BLOCK_LATENCY); static constexpr memref_tid_t TID_BASE = 100; static constexpr memref_tid_t TID_A = TID_BASE + 0; static constexpr memref_tid_t TID_B = TID_BASE + 1; @@ -3428,7 +3461,7 @@ test_direct_switch() // has significant blocked time left. But then after B is scheduled and finishes, // we still have to wait for C's block time so we see idle underscores: static const char *const CORE0_SCHED_STRING = - "...AA.........CC......A....BBBB.____________________C..."; + "...AA.........CC......A....BBBB.______________C..."; std::vector sched_inputs; sched_inputs.emplace_back(std::move(readers)); @@ -3437,12 +3470,16 @@ test_direct_switch() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration = QUANTUM_DURATION; + // We use our mock's time==instruction count for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != scheduler_t::STATUS_SUCCESS) assert(false); std::vector sched_as_string = - run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE); + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } @@ -3457,7 +3494,6 @@ test_main(int argc, const char *argv[]) // Avoid races with lazy drdecode init (b/279350357). 
dr_standalone_init(); - test_direct_switch(); test_serial(); test_parallel(); test_param_checks(); diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp index 67bfe69b0e5..f73fbcd874b 100644 --- a/clients/drcachesim/tools/schedule_stats.cpp +++ b/clients/drcachesim/tools/schedule_stats.cpp @@ -34,6 +34,13 @@ #include "schedule_stats.h" +#ifdef WINDOWS +# define WIN32_LEAN_AND_MEAN +# include +#else +# include +#endif + #include #include @@ -123,6 +130,14 @@ bool schedule_stats_t::parallel_shard_exit(void *shard_data) { // Nothing (we read the shard data in print_results). + per_shard_t *shard = reinterpret_cast(shard_data); + if (shard->prev_was_idle) { + shard->counters.idle_microseconds += + get_current_microseconds() - shard->segment_start_microseconds; + } else if (!shard->prev_was_wait) { + shard->counters.cpu_microseconds += + get_current_microseconds() - shard->segment_start_microseconds; + } return true; } @@ -133,6 +148,25 @@ schedule_stats_t::parallel_shard_error(void *shard_data) return per_shard->error; } +uint64_t +schedule_stats_t::get_current_microseconds() +{ +#ifdef UNIX + struct timeval time; + if (gettimeofday(&time, nullptr) != 0) + return 0; + return time.tv_sec * 1000000 + time.tv_usec; +#else + SYSTEMTIME sys_time; + GetSystemTime(&sys_time); + FILETIME file_time; + if (!SystemTimeToFileTime(&sys_time, &file_time)) + return 0; + return file_time.dwLowDateTime + + (static_cast(file_time.dwHighDateTime) << 32); +#endif +} + bool schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref) { @@ -163,8 +197,10 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref // Cache and reset here to ensure we reset on early return paths. 
bool was_wait = shard->prev_was_wait; bool was_idle = shard->prev_was_idle; + int64_t prev_input = shard->prev_input; shard->prev_was_wait = false; shard->prev_was_idle = false; + shard->prev_input = -1; if (memref.marker.type == TRACE_TYPE_MARKER && memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT) { ++shard->counters.waits; @@ -187,6 +223,11 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref if (!was_idle) { shard->thread_sequence += IDLE_SYMBOL; shard->cur_segment_instrs = 0; + if (!was_wait && shard->segment_start_microseconds > 0) { + shard->counters.idle_microseconds += + get_current_microseconds() - shard->segment_start_microseconds; + } + shard->segment_start_microseconds = get_current_microseconds(); } else { ++shard->cur_segment_instrs; if (shard->cur_segment_instrs == knob_print_every_) { @@ -197,7 +238,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref return true; } int64_t input = shard->stream->get_input_id(); - if (input != shard->prev_input) { + if (input != prev_input) { // We convert to letters which only works well for <=26 inputs. 
if (!shard->thread_sequence.empty()) { ++shard->counters.total_switches; @@ -209,6 +250,11 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref shard->thread_sequence += THREAD_LETTER_INITIAL_START + static_cast(input % 26); shard->cur_segment_instrs = 0; + if (!was_wait && !was_idle && shard->segment_start_microseconds > 0) { + shard->counters.cpu_microseconds += + get_current_microseconds() - shard->segment_start_microseconds; + } + shard->segment_start_microseconds = get_current_microseconds(); if (knob_verbose_ >= 2) { std::ostringstream line; line << "Core #" << std::setw(2) << shard->core << " @" << std::setw(9) @@ -226,11 +272,12 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref << " == thread " << memref.instr.tid << "\n"; std::cerr << line.str(); } - shard->prev_input = input; } + shard->prev_input = input; if (type_is_instr(memref.instr.type)) { ++shard->counters.instrs; ++shard->cur_segment_instrs; + shard->counters.idle_micros_at_last_instr = shard->counters.idle_microseconds; if (shard->cur_segment_instrs == knob_print_every_) { shard->thread_sequence += THREAD_LETTER_SUBSEQUENT_START + static_cast(input % 26); @@ -300,7 +347,22 @@ schedule_stats_t::print_counters(const counters_t &counters) std::cerr << std::setw(12) << std::setprecision(2) << 100 * (counters.instrs / static_cast(counters.instrs + counters.idles)) - << "% cpu busy\n"; + << "% cpu busy by record count\n"; + std::cerr << std::setw(12) << counters.cpu_microseconds << " cpu microseconds\n"; + std::cerr << std::setw(12) << counters.idle_microseconds << " idle microseconds\n"; + std::cerr << std::setw(12) << counters.idle_micros_at_last_instr + << " idle microseconds at last instr\n"; + std::cerr << std::setw(12) << std::setprecision(2) + << 100 * + (counters.cpu_microseconds / + static_cast(counters.cpu_microseconds + counters.idle_microseconds)) + << "% cpu busy by time\n"; + std::cerr << std::setw(12) << 
std::setprecision(2) + << 100 * + (counters.cpu_microseconds / + static_cast(counters.cpu_microseconds + + counters.idle_micros_at_last_instr)) + << "% cpu busy by time, ignoring idle past last instr\n"; } bool diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h index 9d15ccc0638..f0e9425e39d 100644 --- a/clients/drcachesim/tools/schedule_stats.h +++ b/clients/drcachesim/tools/schedule_stats.h @@ -89,6 +89,9 @@ class schedule_stats_t : public analysis_tool_t { direct_switch_requests += rhs.direct_switch_requests; waits += rhs.waits; idles += rhs.idles; + idle_microseconds += rhs.idle_microseconds; + idle_micros_at_last_instr += rhs.idle_micros_at_last_instr; + cpu_microseconds += rhs.cpu_microseconds; for (const memref_tid_t tid : rhs.threads) { threads.insert(tid); } @@ -103,6 +106,9 @@ class schedule_stats_t : public analysis_tool_t { int64_t direct_switch_requests = 0; int64_t waits = 0; int64_t idles = 0; + uint64_t idle_microseconds = 0; + uint64_t idle_micros_at_last_instr = 0; + uint64_t cpu_microseconds = 0; std::unordered_set threads; }; counters_t @@ -124,11 +130,16 @@ class schedule_stats_t : public analysis_tool_t { uint64_t cur_segment_instrs = 0; bool prev_was_wait = false; bool prev_was_idle = false; + // Computing %-idle. + uint64_t segment_start_microseconds = 0; }; void print_counters(const counters_t &counters); + uint64_t + get_current_microseconds(); + uint64_t knob_print_every_ = 0; unsigned int knob_verbose_ = 0; // We use an ordered map to get our output in order. This table is not