From bc965e942332e56a5d2f16db911adde59bc2be56 Mon Sep 17 00:00:00 2001 From: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Date: Fri, 18 Oct 2024 16:38:30 -0700 Subject: [PATCH] code refinement --- shared/api/image_transforms.hpp | 2 + shared/api/image_transforms_mllama.hpp | 169 +++++++++++++++++------- test/data/processor/load_ortx_tsdump.py | 4 +- test/test_pp_api.py | 51 +++---- 4 files changed, 149 insertions(+), 77 deletions(-) diff --git a/shared/api/image_transforms.hpp b/shared/api/image_transforms.hpp index 865a820b..da94bdfb 100644 --- a/shared/api/image_transforms.hpp +++ b/shared/api/image_transforms.hpp @@ -117,6 +117,8 @@ struct Resize { height_ = std::get(value); } else if (key == "width") { width_ = std::get(value); + } else if (key == "keep_aspect_ratio") { + keep_aspect_ratio_ = std::get(value) != 0; } else if (key == "interpolation") { interpolation_ = std::get(value); if (InterpolationMethods().find(interpolation_) == InterpolationMethods().end()) { diff --git a/shared/api/image_transforms_mllama.hpp b/shared/api/image_transforms_mllama.hpp index 583bab5a..56df8969 100644 --- a/shared/api/image_transforms_mllama.hpp +++ b/shared/api/image_transforms_mllama.hpp @@ -11,8 +11,7 @@ #include "image_transforms.hpp" struct Llama3ImageTransform { - static void SplitIntoTitles(const ortc::Tensor& normalized_image, - ortc::Tensor& pixel_values, + static void SplitIntoTitles(const ortc::Tensor& normalized_image, ortc::Tensor& pixel_values, int64_t tile_height, int64_t tile_width) { auto& shape = normalized_image.Shape(); int64_t image_height = shape[0]; @@ -27,21 +26,21 @@ struct Llama3ImageTransform { auto p_normalized_image = normalized_image.Data(); // shape (num_tiles_width * num_tiles_height, num_channels, tile_height, tile_width) - float* output_pixel = pixel_values.Allocate( - {num_tiles_height * num_tiles_width, num_channels, tile_height, tile_width}); + float* output_pixel = + pixel_values.Allocate({num_tiles_height * num_tiles_width, num_channels, tile_height, tile_width}); // From (image_height, image_width, num_channels) // Permute to (num_tiles_height, num_tiles_width, num_channels, tile_height, tile_width) for (int64_t i = 0; i < num_tiles_height; ++i) { for (int64_t j = 0; j < num_tiles_width; ++j) { - // convert to be channel first + // convert to be channel first for (int64_t k = 0; k < num_channels; ++k) { auto sub_index = image_1c_size * (i * num_tiles_width + j) * num_channels + image_1c_size * k; for (int64_t y = 0; y < tile_height; ++y) { for (int64_t x = 0; x < tile_width; ++x) { output_pixel[sub_index + y * tile_width + x] = - p_normalized_image[ - (i * tile_height + y) * image_width * num_channels + (j * tile_width + x) * num_channels + k]; + p_normalized_image[(i * tile_height + y) * image_width * num_channels + + (j * tile_width + x) * num_channels + k]; } } } @@ -83,7 +82,7 @@ struct Llama3ImageTransform { return status; } - DumpTensorToFile(normalized_image, "normalized_image"); + // DumpTensorToFile(normalized_image, "normalized_image"); SplitIntoTitles(normalized_image, pixel_values, tile_size_.first, tile_size_.second); @@ -118,6 +117,31 @@ struct Llama3ImageTransform { return aspect_ratios; } + /* + Calculates the new size of an image to fit within a canvas while maintaining aspect ratio. + + This function calculates the optimal size for an image to fit within a canvas defined by + canvas_height and canvas_width, while ensuring that the image dimensions are not smaller than + tile_size. If the image is larger than the canvas, the returned size will fit within the canvas. + If the image already fits within the canvas, the size remains unchanged. + The aspect ratio of the original image is preserved. + + Args: + image_height (`int`): + The height of the original image. + image_width (`int`): + The width of the original image. + canvas_height (`int`): + The height of the canvas. + canvas_width (`int`): + The width of the canvas. + tile_size (`int`): + The tile size. + + Returns: + `Tuple[int, int]`: A tuple containing the new height and width of the image. + + */ static std::tuple GetImageSizeFitToCanvas(int64_t image_height, int64_t image_width, int64_t canvas_height, int64_t canvas_width, int64_t tile_size) { @@ -132,10 +156,10 @@ struct Llama3ImageTransform { if (scale_w < scale_h) { new_width = target_width; - new_height = std::min(static_cast(std::floor(image_height * scale_w)), target_height); + new_height = static_cast(std::round(image_height * scale_w)); } else { new_height = target_height; - new_width = std::min(static_cast(std::floor(image_width * scale_h)), target_width); + new_width = static_cast(std::round(image_width * scale_h)); } return std::make_tuple(new_height, new_width); @@ -166,56 +190,96 @@ struct Llama3ImageTransform { return aspect_ratio_mask; } + /* + Determines the best canvas based on image and tile size and maximum number of tiles. + + First, calculates possible resolutions based on the maximum number of tiles and tile size. + For example for max_image_tiles=2, tile_size=100, possible tile arrangements are: + [(1, 1), (1, 2), (2, 1)] and corresponding canvas sizes are: + [(100, 100), (100, 200), (200, 100)] + + For each possible resolution, calculates the scaling factors for + width and height, and selects the smallest one, which is the limiting side. + E.g. to match the canvas you can upscale height by 2x, and width by 1.5x, + therefore, the maximum upscaling you can do is min(2, 1.5) = 1.5. + + If upscaling is possible (any of the scaling factors is greater than 1), + then picks the smallest upscaling factor > 1. + + If upscaling is not possible, then picks the largest scaling factor <= 1, i.e. + reduce downscaling as much as possible. + + If there are multiple resolutions with the same max scale, we pick the one with the lowest area, + to minimize padding. E.g., the same image can be upscaled to 224x224 and 224x448, but the latter + has more padding. + + Args: + image_height (`int`): + The height of the image. + image_width (`int`): + The width of the image. + max_image_tiles (`int`): + The maximum number of tiles any image can be split into. + tile_size (`int`): + The tile size. + + Returns: + `pair[int, int]`: The best canvas resolution [height, width] for the given image. + */ static std::pair GetOptimalTiledCanvas(int64_t image_height, int64_t image_width, int64_t max_image_tiles, int64_t tile_size) { - auto possible_tile_arrangements = GetAllSupportedAspectRatios(max_image_tiles); - std::vector> possible_canvas_sizes; + { + auto possible_tile_arrangements = GetAllSupportedAspectRatios(max_image_tiles); + std::vector> possible_canvas_sizes; - for (const auto& arrangement : possible_tile_arrangements) { - possible_canvas_sizes.emplace_back(arrangement.first * tile_size, arrangement.second * tile_size); - } - - std::vector scales; - std::vector> chosen_canvas; - double selected_scale; - - for (const auto& canvas : possible_canvas_sizes) { - double scale_h = static_cast(canvas.second) / image_height; - double scale_w = static_cast(canvas.first) / image_width; - double scale = std::min(scale_h, scale_w); - scales.push_back(scale); - } - - auto upscaling_it = std::find_if(scales.begin(), scales.end(), [](double scale) { return scale >= 1.0; }); + for (const auto& arrangement : possible_tile_arrangements) { + possible_canvas_sizes.emplace_back(arrangement.first * tile_size, arrangement.second * tile_size); + } - if (upscaling_it != scales.end()) { - selected_scale = *std::min_element(upscaling_it, scales.end()); - } else { - selected_scale = *std::max_element(scales.begin(), scales.end()); - } + std::vector scales; + for (const auto& size : possible_canvas_sizes) { + double scale_h = static_cast(size.first) / image_height; + double scale_w = static_cast(size.second) / image_width; + scales.push_back(std::min(scale_h, scale_w)); + } - for (size_t i = 0; i < scales.size(); ++i) { - if (std::abs(scales[i] - selected_scale) < std::numeric_limits::epsilon()) { - chosen_canvas.push_back(possible_canvas_sizes[i]); + double selected_scale = 0; + std::vector upscaling_options; + for (double scale : scales) { + if (scale >= 1) { + upscaling_options.push_back(scale); + } } - } - std::pair optimal_canvas; + if (!upscaling_options.empty()) { + selected_scale = *std::min_element(upscaling_options.begin(), upscaling_options.end()); + } else { + std::vector downscaling_options; + for (double scale : scales) { + if (scale < 1) { + downscaling_options.push_back(scale); + } + } + selected_scale = *std::max_element(downscaling_options.begin(), downscaling_options.end()); + } - if (chosen_canvas.size() > 1) { - int64_t min_area = std::numeric_limits::max(); - for (const auto& canvas : chosen_canvas) { - int64_t area = canvas.first * canvas.second; - if (area < min_area) { - min_area = area; - optimal_canvas = canvas; + std::vector> chosen_canvas; + for (size_t i = 0; i < scales.size(); ++i) { + if (std::abs(scales[i] - selected_scale) < 1e-9) { + chosen_canvas.push_back(possible_canvas_sizes[i]); } } - } else { - optimal_canvas = chosen_canvas[0]; - } - return std::make_pair(optimal_canvas.second, optimal_canvas.first); + if (chosen_canvas.size() > 1) { + auto optimal_canvas = std::min_element(chosen_canvas.begin(), chosen_canvas.end(), + [](const std::pair& a, const std::pair& b) { + return (a.first * a.second) < (b.first * b.second); + }); + return *optimal_canvas; + } else { + return chosen_canvas[0]; + } + } } static std::vector ConvertAspectRatiosToIds(const std::vector>& aspect_ratios, @@ -269,9 +333,12 @@ struct Llama3ImageTransform { aspect_ratio = std::make_pair(num_tiles_height, num_tiles_width); auto [new_height, new_width] = GetImageSizeFitToCanvas(image_height, image_width, canvas_height, canvas_width, tile_size); + Resize resizer; - std::unordered_map attrs = { - {"height", new_height}, {"width", new_width}, {"interpolation", std::string("LINEAR")}}; + std::unordered_map attrs = {{"height", new_height}, + {"width", new_width}, + {"interpolation", std::string("LINEAR")}, + {"keep_aspect_ratio", int64_t(0)}}; OrtxStatus status = resizer.Init(attrs); if (!status.IsOk()) { return status; diff --git a/test/data/processor/load_ortx_tsdump.py b/test/data/processor/load_ortx_tsdump.py index dcc35d3a..d2dfe129 100644 --- a/test/data/processor/load_ortx_tsdump.py +++ b/test/data/processor/load_ortx_tsdump.py @@ -4,7 +4,7 @@ from PIL import Image -dumping_file_path = "C:\\temp\\normalized_image822870921_f_1120.bin" +dumping_file_path = "C:\\temp\\normalized_image826885234_f_560.bin" def regen_image(arr): @@ -35,7 +35,7 @@ def regen_image(arr): raw_data = raw_data.reshape((image_height, image_width, 3)) # from bgr to rgb -raw_data = raw_data[:, :, ::-1] +# raw_data = raw_data[:, :, ::-1] # save the image to disk if dtype == np.float32: diff --git a/test/test_pp_api.py b/test/test_pp_api.py index 8a7e35b0..4965f13f 100644 --- a/test/test_pp_api.py +++ b/test/test_pp_api.py @@ -50,10 +50,12 @@ def setUpClass(cls): def test_CLIP_image_processing(self): model_id = "openai/clip-vit-large-patch14" + image_list = ["test/data/processor/australia.jpg", + "test/data/processor/passport.png", + "test/data/processor/exceltable.png"] + (image, image2, image3) = [Image.open(f) for f in image_list] + processor = AutoImageProcessor.from_pretrained(model_id) - image = Image.open("test/data/processor/australia.jpg") - image2 = Image.open("test/data/processor/passport.png") - image3 = Image.open("test/data/processor/exceltable.png") inputs = processor.preprocess( [image, image2, image3], return_tensors="np") print({k: v.shape if k == "pixel_values" else v for k, v in inputs.items()}) @@ -66,8 +68,7 @@ def test_CLIP_image_processing(self): ort_processor = pp_api.ImageProcessor( "test/data/processor/clip_image.json") - inputs = ort_processor.pre_process( - ["test/data/processor/australia.jpg", "test/data/processor/passport.png", "test/data/processor/exceltable.png"]) + inputs = ort_processor.pre_process(image_list) print(ort_processor.to_numpy(inputs, 0).shape) actual_images = ort_processor.to_numpy(inputs, 0) for i in range(len(actual_images)): @@ -77,8 +78,6 @@ def test_CLIP_image_processing(self): def test_llama3_2_image_processing(self): model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" - processor = AutoImageProcessor.from_pretrained( - model_id, token=TestPPAPI.token_id) url = ("https://huggingface.co/datasets/huggingface/" "documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg") @@ -87,29 +86,33 @@ def test_llama3_2_image_processing(self): f.write(requests.get(url).content) # image = Image.open(requests.get(url, stream=True).raw) - image = Image.open(f"{self.temp_dir}/rabbit.jpg") - image2 = Image.open("test/data/processor/passport.png") - image3 = Image.open("test/data/processor/exceltable.png") + image_list = [f"{self.temp_dir}/rabbit.jpg", + "test/data/processor/passport.png", + "test/data/processor/exceltable.png"] + (image, image2, image3) = [Image.open(f) for f in image_list] + + processor = AutoImageProcessor.from_pretrained(model_id, token=TestPPAPI.token_id) inputs = processor.preprocess( [image, image2, image3], return_tensors="np") print({k: v.shape if k == "pixel_values" else v for k, v in inputs.items()}) - expected_images = inputs["pixel_values"][0][0] - for i in range(len(expected_images)): - expected = expected_images[i] - e_image = regen_image(np.transpose(expected, (1, 2, 0))) - e_image.save(f"{self.temp_dir}/e_{i}.png") - ort_processor = pp_api.ImageProcessor( "test/data/processor/mllama/llama_3_image.json") - inputs = ort_processor.pre_process( - [f"{self.temp_dir}/rabbit.jpg", "test/data/processor/passport.png", "test/data/processor/exceltable.png"]) - print(ort_processor.to_numpy(inputs, 0).shape) - actual_images = ort_processor.to_numpy(inputs, 0)[0] - for i in range(len(actual_images)): - actual = actual_images[i] - a_image = regen_image(np.transpose(actual, (1, 2, 0))) - a_image.save(f"{self.temp_dir}/a_{i}.png") + ort_inputs = ort_processor.to_numpy(ort_processor.pre_process(image_list), 0) + print(ort_inputs.shape) + + for idx in range(len(image_list)): + expected_images = inputs["pixel_values"][0][idx] + for i in range(len(expected_images)): + expected = expected_images[i] + e_image = regen_image(np.transpose(expected, (1, 2, 0))) + e_image.save(f"{self.temp_dir}/e_{idx}_{i}.png") + + actual_images = ort_inputs[idx] + for i in range(len(actual_images)): + actual = actual_images[i] + a_image = regen_image(np.transpose(actual, (1, 2, 0))) + a_image.save(f"{self.temp_dir}/a_{idx}_{i}.png") if __name__ == '__main__':