From b2f27186d243d6afc2324c43b3bf9a9f681be474 Mon Sep 17 00:00:00 2001
From: Ryan Kuester
Date: Sat, 14 Dec 2024 12:00:55 -0700
Subject: [PATCH] feat: enhance profiling and benchmarking (#3012)

Modify `PrintMemoryPlan` in `greedy_memory_planner.cc` for better handling
of tensor indices and scratch buffers. Fix `total_ticks_per_tag_` usage in
`micro_profiler.cc` and add `ClearEvents` method. Update `Makefile.inc` and
`generic_model_benchmark.cc` to support alternate memory regions and CRC32
checks for data integrity. Include compression data in `metrics.cc`
allocation records and handle architecture-specific directives in
`show_meta_data.cc.template`.

BUG=see description
---
 .../memory_planner/greedy_memory_planner.cc   |  22 ++-
 tensorflow/lite/micro/micro_profiler.cc       |  19 +-
 tensorflow/lite/micro/micro_profiler.h        |   6 +-
 .../micro/tools/benchmarking/Makefile.inc     |   9 +
 .../tools/benchmarking/collect_meta_data.sh   |   2 +-
 .../benchmarking/generic_model_benchmark.cc   | 174 ++++++++++++++++--
 .../lite/micro/tools/benchmarking/metrics.cc  |  30 ++-
 .../benchmarking/show_meta_data.cc.template   |   7 +
 8 files changed, 228 insertions(+), 41 deletions(-)
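Reviewer note (this text sits below the "---" marker and is not applied with
the patch): GENERIC_BENCHMARK_ALT_MEM_ATTR is expected to expand to a
placement attribute and GENERIC_BENCHMARK_ALT_MEM_SIZE to a byte count, which
Makefile.inc forwards as -D defines when ENABLE_COMPRESSION=yes. A sketch of
the resulting declaration in generic_model_benchmark.cc; the section name and
size are illustrative assumptions, not values required by this patch:

  // Hypothetical expansion, assuming the build passed
  //   GENERIC_BENCHMARK_ALT_MEM_ATTR='__attribute__((section(".alt_mem")))'
  //   GENERIC_BENCHMARK_ALT_MEM_SIZE=262144
  constexpr size_t kAltMemorySize = 262144;
  alignas(16) __attribute__((section(".alt_mem")))
      uint8_t g_alt_memory[kAltMemorySize];

The two macros must be supplied together (the benchmark #errors if only one
is defined), and the alternate region is only used when USE_TFLM_COMPRESSION
is also defined.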
" " : "", t, (const char*)line, + MicroPrintf("%4d: %s (%dk)", t, (const char*)line, (memory_use + 1023) / 1024); } } diff --git a/tensorflow/lite/micro/micro_profiler.cc b/tensorflow/lite/micro/micro_profiler.cc index ebead51a90d..e349bf73668 100644 --- a/tensorflow/lite/micro/micro_profiler.cc +++ b/tensorflow/lite/micro/micro_profiler.cc @@ -86,14 +86,14 @@ void MicroProfiler::LogTicksPerTagCsv() { TFLITE_DCHECK(tags_[i] != nullptr); int position = FindExistingOrNextPosition(tags_[i]); TFLITE_DCHECK(position >= 0); - total_ticks_per_tag[position].tag = tags_[i]; - total_ticks_per_tag[position].ticks = - total_ticks_per_tag[position].ticks + ticks; + total_ticks_per_tag_[position].tag = tags_[i]; + total_ticks_per_tag_[position].ticks = + total_ticks_per_tag_[position].ticks + ticks; total_ticks += ticks; } for (int i = 0; i < num_events_; ++i) { - TicksPerTag each_tag_entry = total_ticks_per_tag[i]; + TicksPerTag each_tag_entry = total_ticks_per_tag_[i]; if (each_tag_entry.tag == nullptr) { break; } @@ -112,7 +112,7 @@ void MicroProfiler::LogTicksPerTagCsv() { int MicroProfiler::FindExistingOrNextPosition(const char* tag_name) { int pos = 0; for (; pos < num_events_; pos++) { - TicksPerTag each_tag_entry = total_ticks_per_tag[pos]; + TicksPerTag each_tag_entry = total_ticks_per_tag_[pos]; if (each_tag_entry.tag == nullptr || strcmp(each_tag_entry.tag, tag_name) == 0) { return pos; @@ -120,4 +120,13 @@ int MicroProfiler::FindExistingOrNextPosition(const char* tag_name) { } return pos < num_events_ ? pos : -1; } + +void MicroProfiler::ClearEvents() { + for (int i = 0; i < num_events_; i++) { + total_ticks_per_tag_[i].tag = nullptr; + } + + num_events_ = 0; +} + } // namespace tflite diff --git a/tensorflow/lite/micro/micro_profiler.h b/tensorflow/lite/micro/micro_profiler.h index b52ebcb4ea9..fd8bc42ffd4 100644 --- a/tensorflow/lite/micro/micro_profiler.h +++ b/tensorflow/lite/micro/micro_profiler.h @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ class MicroProfiler : public MicroProfilerInterface { virtual void EndEvent(uint32_t event_handle) override; // Clears all the events that have been currently profiled. - void ClearEvents() { num_events_ = 0; } + void ClearEvents(); // Returns the sum of the ticks taken across all the events. This number // is only meaningful if all of the events are disjoint (the end time of @@ -83,7 +83,7 @@ class MicroProfiler : public MicroProfilerInterface { // In practice, the number of tags will be much lower than the number of // events. But it is theoretically possible that each event to be unique and // hence we allow total_ticks_per_tag to have kMaxEvents entries. 
-  TicksPerTag total_ticks_per_tag[kMaxEvents] = {};
+  TicksPerTag total_ticks_per_tag_[kMaxEvents] = {};
 
   int FindExistingOrNextPosition(const char* tag_name);
 
diff --git a/tensorflow/lite/micro/tools/benchmarking/Makefile.inc b/tensorflow/lite/micro/tools/benchmarking/Makefile.inc
index 396e7016384..a79420cb982 100644
--- a/tensorflow/lite/micro/tools/benchmarking/Makefile.inc
+++ b/tensorflow/lite/micro/tools/benchmarking/Makefile.inc
@@ -20,6 +20,15 @@ endif
     $(GENERATED_SRCS_DIR)$(GENERIC_BENCHMARK_MODEL_DIR)$(GENERIC_BENCHMARK_MODEL_NAME)_model_data.h
 endif
 
+ifeq ($(ENABLE_COMPRESSION), yes)
+ifneq ($(GENERIC_BENCHMARK_ALT_MEM_ATTR),)
+  CXXFLAGS += -DGENERIC_BENCHMARK_ALT_MEM_ATTR=$(GENERIC_BENCHMARK_ALT_MEM_ATTR)
+endif
+ifneq ($(GENERIC_BENCHMARK_ALT_MEM_SIZE),)
+  CXXFLAGS += -DGENERIC_BENCHMARK_ALT_MEM_SIZE=$(GENERIC_BENCHMARK_ALT_MEM_SIZE)
+endif
+endif
+
 GENERIC_BENCHMARK_SRCS := \
 $(MICROLITE_BENCHMARK_ROOT_DIR)/generic_model_benchmark.cc \
 $(MICROLITE_BENCHMARK_ROOT_DIR)/metrics.cc \
diff --git a/tensorflow/lite/micro/tools/benchmarking/collect_meta_data.sh b/tensorflow/lite/micro/tools/benchmarking/collect_meta_data.sh
index c60bdf3ed72..424a1b8da65 100755
--- a/tensorflow/lite/micro/tools/benchmarking/collect_meta_data.sh
+++ b/tensorflow/lite/micro/tools/benchmarking/collect_meta_data.sh
@@ -52,7 +52,7 @@ function substitute_strings() {
     IFS=${SAVED_IFS}
     replacement=()
     for line in "${lines_array[@]}"; do
-      line=$(sed -e 's/"/\\"/g' <<< "${line}")
+      line=$(sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' <<< "${line}")
       line=$(printf '"%s",\n ' "${line}")
       replacement+=( "${line}" )
     done
diff --git a/tensorflow/lite/micro/tools/benchmarking/generic_model_benchmark.cc b/tensorflow/lite/micro/tools/benchmarking/generic_model_benchmark.cc
index f398963a00d..0f58219644b 100644
--- a/tensorflow/lite/micro/tools/benchmarking/generic_model_benchmark.cc
+++ b/tensorflow/lite/micro/tools/benchmarking/generic_model_benchmark.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include
 #include
+#include
 #include
 #include
 #include
@@ -56,19 +57,37 @@ limitations under the License.
 
 #endif  // defind(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
 
+#if defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) && \
+    !defined(GENERIC_BENCHMARK_ALT_MEM_SIZE)
+#error "GENERIC_BENCHMARK_ALT_MEM_SIZE missing from CXXFLAGS"
+#endif  // defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) &&
+        // !defined(GENERIC_BENCHMARK_ALT_MEM_SIZE)
+
+#if defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) && \
+    !defined(GENERIC_BENCHMARK_ALT_MEM_ATTR)
+#error "GENERIC_BENCHMARK_ALT_MEM_ATTR missing from CXXFLAGS"
+#endif  // defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) &&
+        // !defined(GENERIC_BENCHMARK_ALT_MEM_ATTR)
+
+#if defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) && \
+    defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) && defined(USE_TFLM_COMPRESSION)
+#define USE_ALT_DECOMPRESSION_MEM
+#endif  // defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) &&
+        // defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) &&
+        // defined(USE_TFLM_COMPRESSION)
+
 /*
- * Generic model benchmark. Evaluates runtime performance of a provided model
- * with random inputs.
+ * Generic model benchmark. Evaluates runtime performance of a provided
+ * model with random inputs.
  */
 
 namespace tflite {
-
 namespace {
 
 using Profiler = ::tflite::MicroProfiler;
 
-// Seed used for the random input. Input data shouldn't affect invocation timing
-// so randomness isn't really needed.
+// Seed used for the random input. Input data shouldn't affect invocation
+// timing so randomness isn't really needed.
 constexpr uint32_t kRandomSeed = 0xFB;
 
 #if !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
@@ -80,6 +99,11 @@ constexpr size_t kTensorArenaSize = GENERIC_BENCHMARK_TENSOR_ARENA_SIZE;
 constexpr size_t kTensorArenaSize = 5e6 - MODEL_SIZE;
 #endif  // !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
 
+#if defined(USE_ALT_DECOMPRESSION_MEM)
+constexpr size_t kAltMemorySize = GENERIC_BENCHMARK_ALT_MEM_SIZE;
+alignas(16) GENERIC_BENCHMARK_ALT_MEM_ATTR uint8_t g_alt_memory[kAltMemorySize];
+#endif  // defined(USE_ALT_DECOMPRESSION_MEM)
+
 constexpr int kNumResourceVariable = 100;
 
 void SetRandomInput(const uint32_t random_seed,
@@ -130,39 +154,146 @@ bool ReadFile(const char* file_name, void* buffer, size_t buffer_size) {
 }
 #endif  // !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
 
+constexpr uint32_t kCrctabLen = 256;
+uint32_t crctab[kCrctabLen];
+
+void GenCRC32Table() {
+  constexpr uint32_t kPolyN = 0xEDB88320;
+  for (size_t index = 0; index < kCrctabLen; index++) {
+    crctab[index] = index;
+    for (int i = 0; i < 8; i++) {
+      if (crctab[index] & 1) {
+        crctab[index] = (crctab[index] >> 1) ^ kPolyN;
+      } else {
+        crctab[index] >>= 1;
+      }
+    }
+  }
+}
+
+uint32_t ComputeCRC32(const uint8_t* data, const size_t data_length) {
+  uint32_t crc32 = ~0U;
+
+  for (size_t i = 0; i < data_length; i++) {
+    // crctab is an array of 256 32-bit constants
+    const uint32_t index = (crc32 ^ data[i]) & (kCrctabLen - 1);
+    crc32 = (crc32 >> 8) ^ crctab[index];
+  }
+
+  // invert all bits of result
+  crc32 ^= ~0U;
+  return crc32;
+}
+
+void ShowOutputCRC32(tflite::MicroInterpreter* interpreter) {
+  GenCRC32Table();
+  for (size_t i = 0; i < interpreter->outputs_size(); ++i) {
+    TfLiteTensor* output = interpreter->output_tensor(i);
+    uint8_t* output_values = tflite::GetTensorData<uint8_t>(output);
+    uint32_t crc32_value = ComputeCRC32(output_values, output->bytes);
+    MicroPrintf("Output CRC32: 0x%X", crc32_value);
+  }
+}
+
+void ShowInputCRC32(tflite::MicroInterpreter* interpreter) {
+  GenCRC32Table();
+  for (size_t i = 0; i < interpreter->inputs_size(); ++i) {
+    TfLiteTensor* input = interpreter->input_tensor(i);
+    uint8_t* input_values = tflite::GetTensorData<uint8_t>(input);
+    uint32_t crc32_value = ComputeCRC32(input_values, input->bytes);
+    MicroPrintf("Input CRC32: 0x%X", crc32_value);
+  }
+}
+
 int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {
-  Profiler profiler;
+  static Profiler profiler;
+  static Profiler profiler2;
+  TfLiteStatus status;
+
+// use this to keep the application size stable regardless of whether
+// compression is being used
+#ifdef USE_TFLM_COMPRESSION
+  constexpr bool using_compression = true;
+#else  // USE_TFLM_COMPRESSION
+  constexpr bool using_compression = false;
+#endif  // USE_TFLM_COMPRESSION
+
   alignas(16) static uint8_t tensor_arena[kTensorArenaSize];
 
-  uint32_t event_handle = profiler.BeginEvent("TfliteGetModel");
+#ifdef USE_ALT_DECOMPRESSION_MEM
+  std::initializer_list
+      alt_memory_region = {{g_alt_memory, kAltMemorySize}};
+#endif  // USE_ALT_DECOMPRESSION_MEM
+
+  uint32_t event_handle = profiler.BeginEvent("tflite::GetModel");
   const tflite::Model* model = tflite::GetModel(model_data);
   profiler.EndEvent(event_handle);
+  event_handle = profiler.BeginEvent("tflite::CreateOpResolver");
   TflmOpResolver op_resolver;
-  TF_LITE_ENSURE_STATUS(CreateOpResolver(op_resolver));
+  status = CreateOpResolver(op_resolver);
+  if (status != kTfLiteOk) {
+    MicroPrintf("tflite::CreateOpResolver failed");
+    return -1;
+  }
+  profiler.EndEvent(event_handle);
 
+  event_handle =
profiler.BeginEvent("tflite::RecordingMicroAllocator::Create"); tflite::RecordingMicroAllocator* allocator( tflite::RecordingMicroAllocator::Create(tensor_arena, kTensorArenaSize)); + profiler.EndEvent(event_handle); + event_handle = profiler.BeginEvent("tflite::MicroInterpreter instantiation"); tflite::RecordingMicroInterpreter interpreter( model, op_resolver, allocator, tflite::MicroResourceVariables::Create(allocator, kNumResourceVariable), &profiler); - TF_LITE_ENSURE_STATUS(interpreter.AllocateTensors()); + profiler.EndEvent(event_handle); + +#ifdef USE_ALT_DECOMPRESSION_MEM + event_handle = + profiler.BeginEvent("tflite::MicroInterpreter::SetDecompressionMemory"); + status = interpreter.SetDecompressionMemory(alt_memory_region); + if (status != kTfLiteOk) { + MicroPrintf("tflite::MicroInterpreter::SetDecompressionMemory failed"); + return -1; + } + profiler.EndEvent(event_handle); +#endif // USE_ALT_DECOMPRESSION_MEM + + event_handle = + profiler.BeginEvent("tflite::MicroInterpreter::AllocateTensors"); + status = interpreter.AllocateTensors(); + if (status != kTfLiteOk) { + MicroPrintf("tflite::MicroInterpreter::AllocateTensors failed"); + return -1; + } + profiler.EndEvent(event_handle); - profiler.Log(); + profiler.LogTicksPerTagCsv(); profiler.ClearEvents(); + if (using_compression) { + status = interpreter.SetAlternateProfiler(&profiler2); + if (status != kTfLiteOk) { + MicroPrintf("tflite::MicroInterpreter::SetAlternateProfiler failed"); + return -1; + } + } + MicroPrintf(""); // null MicroPrintf serves as a newline. - // For streaming models, the interpreter will return kTfLiteAbort if the model - // does not yet have enough data to make an inference. As such, we need to - // invoke the interpreter multiple times until we either receive an error or - // kTfLiteOk. This loop also works for non-streaming models, as they'll just - // return kTfLiteOk after the first invocation. + // For streaming models, the interpreter will return kTfLiteAbort if the + // model does not yet have enough data to make an inference. As such, we + // need to invoke the interpreter multiple times until we either receive an + // error or kTfLiteOk. This loop also works for non-streaming models, as + // they'll just return kTfLiteOk after the first invocation. uint32_t seed = kRandomSeed; while (true) { SetRandomInput(seed++, interpreter); - TfLiteStatus status = interpreter.Invoke(); + ShowInputCRC32(&interpreter); + MicroPrintf(""); // null MicroPrintf serves as a newline. + + status = interpreter.Invoke(); if ((status != kTfLiteOk) && (static_cast(status) != kTfLiteAbort)) { MicroPrintf("Model interpreter invocation failed: %d\n", status); return -1; @@ -174,6 +305,17 @@ int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) { MicroPrintf(""); // null MicroPrintf serves as a newline. profiler.ClearEvents(); + if (using_compression) { + profiler2.Log(); + MicroPrintf(""); // null MicroPrintf serves as a newline. + profiler2.LogTicksPerTagCsv(); + MicroPrintf(""); // null MicroPrintf serves as a newline. + profiler2.ClearEvents(); + } + + ShowOutputCRC32(&interpreter); + MicroPrintf(""); // null MicroPrintf serves as a newline. 
+
     if (status == kTfLiteOk) {
       break;
     }
diff --git a/tensorflow/lite/micro/tools/benchmarking/metrics.cc b/tensorflow/lite/micro/tools/benchmarking/metrics.cc
index 3a4bf7e4917..f71a4cd139e 100644
--- a/tensorflow/lite/micro/tools/benchmarking/metrics.cc
+++ b/tensorflow/lite/micro/tools/benchmarking/metrics.cc
@@ -1,4 +1,4 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -46,7 +46,8 @@ struct LogAllocationRecord {
 
 constexpr int kArenaRows = 3;
 constexpr int kArenaColumns = 3;
-constexpr int kAllocationTypes = 7;
+constexpr int kAllocationTypes =
+    static_cast<int>(tflite::RecordedAllocationType::kNumAllocationTypes);
 constexpr int kAllocationColumns = 6;
 constexpr int kMaxBufSize = 100;
@@ -85,16 +86,25 @@ LogAllocationRecord GetLogAllocationRecord(
       tflite::RecordedAllocationType::kPersistentBufferData,
       tflite::RecordedAllocationType::kTfLiteTensorVariableBufferData,
       tflite::RecordedAllocationType::kNodeAndRegistrationArray,
-      tflite::RecordedAllocationType::kOpData};
+      tflite::RecordedAllocationType::kOpData,
+#ifdef USE_TFLM_COMPRESSION
+      tflite::RecordedAllocationType::kCompressionData,
+#endif  // USE_TFLM_COMPRESSION
+  };
   static_assert(std::extent::value == kAllocationTypes,
                 "kAllocationTypes mismatch");
-  const char* titles[] = {"Eval tensor data",
-                          "Persistent tensor data",
-                          "Persistent quantization data",
-                          "Persistent buffer data",
-                          "Tensor variable buffer data",
-                          "Node and registration array",
-                          "Operation data"};
+  const char* titles[] = {
+      "Eval tensor data",
+      "Persistent tensor data",
+      "Persistent quantization data",
+      "Persistent buffer data",
+      "Tensor variable buffer data",
+      "Node and registration array",
+      "Operation data",
+#ifdef USE_TFLM_COMPRESSION
+      "Compression data",
+#endif  // USE_TFLM_COMPRESSION
+  };
   static_assert(std::extent::value == kAllocationTypes,
                 "kAllocationTypes mismatch");
   const size_t total_bytes =
diff --git a/tensorflow/lite/micro/tools/benchmarking/show_meta_data.cc.template b/tensorflow/lite/micro/tools/benchmarking/show_meta_data.cc.template
index a2102a48e1c..8ec4e512f7a 100644
--- a/tensorflow/lite/micro/tools/benchmarking/show_meta_data.cc.template
+++ b/tensorflow/lite/micro/tools/benchmarking/show_meta_data.cc.template
@@ -20,6 +20,13 @@ limitations under the License.
 #include "tensorflow/lite/micro/micro_log.h"
 #include "tensorflow/lite/micro/tools/benchmarking/show_meta_data.h"
 
+#ifndef XTENSA
+#undef HIFI3
+#undef HIFI4
+#undef HIFI5
+#undef VISION_P6
+#endif  // XTENSA
+
 #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
 #include "NatureDSP_Signal_id.h"
 #include "xa_nnlib_standards.h"