From bc965e942332e56a5d2f16db911adde59bc2be56 Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Fri, 18 Oct 2024 16:38:30 -0700
Subject: [PATCH] code refinement

---
 shared/api/image_transforms.hpp         |   2 +
 shared/api/image_transforms_mllama.hpp  | 169 +++++++++++++++++-------
 test/data/processor/load_ortx_tsdump.py |   4 +-
 test/test_pp_api.py                     |  51 +++----
 4 files changed, 149 insertions(+), 77 deletions(-)
diff --git a/shared/api/image_transforms.hpp b/shared/api/image_transforms.hpp
index 865a820b..da94bdfb 100644
--- a/shared/api/image_transforms.hpp
+++ b/shared/api/image_transforms.hpp
@@ -117,6 +117,8 @@ struct Resize {
         height_ = std::get<int64_t>(value);
       } else if (key == "width") {
         width_ = std::get<int64_t>(value);
+      } else if (key == "keep_aspect_ratio") {
+        keep_aspect_ratio_ = std::get<int64_t>(value) != 0;
       } else if (key == "interpolation") {
         interpolation_ = std::get<std::string>(value);
         if (InterpolationMethods().find(interpolation_) == InterpolationMethods().end()) {
diff --git a/shared/api/image_transforms_mllama.hpp b/shared/api/image_transforms_mllama.hpp
index 583bab5a..56df8969 100644
--- a/shared/api/image_transforms_mllama.hpp
+++ b/shared/api/image_transforms_mllama.hpp
@@ -11,8 +11,7 @@
 #include "image_transforms.hpp"
 
 struct Llama3ImageTransform {
-  static void SplitIntoTitles(const ortc::Tensor<float>& normalized_image,
-                              ortc::Tensor<float>& pixel_values,
+  static void SplitIntoTitles(const ortc::Tensor<float>& normalized_image, ortc::Tensor<float>& pixel_values,
                               int64_t tile_height, int64_t tile_width) {
     auto& shape = normalized_image.Shape();
     int64_t image_height = shape[0];
@@ -27,21 +26,21 @@ struct Llama3ImageTransform {
 
     auto p_normalized_image = normalized_image.Data();
     // shape (num_tiles_width * num_tiles_height, num_channels, tile_height, tile_width)
-    float* output_pixel = pixel_values.Allocate(
-      {num_tiles_height * num_tiles_width, num_channels, tile_height, tile_width});
+    float* output_pixel =
+        pixel_values.Allocate({num_tiles_height * num_tiles_width, num_channels, tile_height, tile_width});
 
     // From (image_height, image_width, num_channels)
     // Permute to (num_tiles_height, num_tiles_width, num_channels, tile_height, tile_width)
     for (int64_t i = 0; i < num_tiles_height; ++i) {
       for (int64_t j = 0; j < num_tiles_width; ++j) {
-          // convert to be channel first
+        // convert to be channel first
         for (int64_t k = 0; k < num_channels; ++k) {
           auto sub_index = image_1c_size * (i * num_tiles_width + j) * num_channels + image_1c_size * k;
           for (int64_t y = 0; y < tile_height; ++y) {
             for (int64_t x = 0; x < tile_width; ++x) {
               output_pixel[sub_index + y * tile_width + x] =
-                  p_normalized_image[
-                    (i * tile_height + y) * image_width * num_channels + (j * tile_width + x) * num_channels + k];
+                  p_normalized_image[(i * tile_height + y) * image_width * num_channels +
+                                     (j * tile_width + x) * num_channels + k];
             }
           }
         }
@@ -83,7 +82,7 @@ struct Llama3ImageTransform {
       return status;
     }
 
-    DumpTensorToFile(normalized_image, "normalized_image");
+    // DumpTensorToFile(normalized_image, "normalized_image");
 
     SplitIntoTitles(normalized_image, pixel_values, tile_size_.first, tile_size_.second);
 
@@ -118,6 +117,31 @@ struct Llama3ImageTransform {
     return aspect_ratios;
   }
 
+  /*
+    Calculates the new size of an image to fit within a canvas while maintaining aspect ratio.
+
+    This function calculates the optimal size for an image to fit within a canvas defined by
+    canvas_height and canvas_width, while ensuring that the image dimensions are not smaller than
+    tile_size. If the image is larger than the canvas, the returned size will fit within the canvas.
+    If the image already fits within the canvas, the size remains unchanged.
+    The aspect ratio of the original image is preserved.
+
+    Args:
+        image_height (`int`):
+            The height of the original image.
+        image_width (`int`):
+            The width of the original image.
+        canvas_height (`int`):
+            The height of the canvas.
+        canvas_width (`int`):
+            The width of the canvas.
+        tile_size (`int`):
+            The tile size.
+
+    Returns:
+        `Tuple[int, int]`: A tuple containing the new height and width of the image.
+
+  */
   static std::tuple<int64_t, int64_t> GetImageSizeFitToCanvas(int64_t image_height, int64_t image_width,
                                                               int64_t canvas_height, int64_t canvas_width,
                                                               int64_t tile_size) {
@@ -132,10 +156,10 @@ struct Llama3ImageTransform {
 
     if (scale_w < scale_h) {
       new_width = target_width;
-      new_height = std::min(static_cast<int64_t>(std::floor(image_height * scale_w)), target_height);
+      new_height = static_cast<int64_t>(std::round(image_height * scale_w));
     } else {
       new_height = target_height;
-      new_width = std::min(static_cast<int64_t>(std::floor(image_width * scale_h)), target_width);
+      new_width = static_cast<int64_t>(std::round(image_width * scale_h));
     }
 
     return std::make_tuple(new_height, new_width);
@@ -166,56 +190,96 @@ struct Llama3ImageTransform {
     return aspect_ratio_mask;
   }
 
+  /*
+  Determines the best canvas based on image and tile size and maximum number of tiles.
+
+  First, calculates possible resolutions based on the maximum number of tiles and tile size.
+  For example for max_image_tiles=2, tile_size=100, possible tile arrangements are:
+  [(1, 1), (1, 2), (2, 1)] and corresponding canvas sizes are:
+  [(100, 100), (100, 200), (200, 100)]
+
+  For each possible resolution, calculates the scaling factors for
+  width and height, and selects the smallest one, which is the limiting side.
+  E.g. to match the canvas you can upscale height by 2x, and width by 1.5x,
+  therefore, the maximum upscaling you can do is min(2, 1.5) = 1.5.
+
+  If upscaling is possible (any of the scaling factors is greater than or equal to 1),
+  then picks the smallest scaling factor >= 1.
+
+  If upscaling is not possible, then picks the largest scaling factor < 1, i.e.
+  reduces downscaling as much as possible.
+
+  If there are multiple resolutions with the same max scale, we pick the one with the lowest area,
+  to minimize padding. E.g., the same image can be upscaled to 224x224 and 224x448, but the latter
+  has more padding.
+
+  Args:
+      image_height (`int`):
+          The height of the image.
+      image_width (`int`):
+          The width of the image.
+      max_image_tiles (`int`):
+          The maximum number of tiles any image can be split into.
+      tile_size (`int`):
+          The tile size.
+
+  Returns:
+      `pair[int, int]`: The best canvas resolution [height, width] for the given image.
+  */
   static std::pair<int64_t, int64_t> GetOptimalTiledCanvas(int64_t image_height, int64_t image_width,
                                                            int64_t max_image_tiles, int64_t tile_size) {
-    auto possible_tile_arrangements = GetAllSupportedAspectRatios(max_image_tiles);
-    std::vector<std::pair<int64_t, int64_t>> possible_canvas_sizes;
+    {
+      auto possible_tile_arrangements = GetAllSupportedAspectRatios(max_image_tiles);
+      std::vector<std::pair<int, int>> possible_canvas_sizes;
 
-    for (const auto& arrangement : possible_tile_arrangements) {
-      possible_canvas_sizes.emplace_back(arrangement.first * tile_size, arrangement.second * tile_size);
-    }
-
-    std::vector<double> scales;
-    std::vector<std::pair<int64_t, int64_t>> chosen_canvas;
-    double selected_scale;
-
-    for (const auto& canvas : possible_canvas_sizes) {
-      double scale_h = static_cast<double>(canvas.second) / image_height;
-      double scale_w = static_cast<double>(canvas.first) / image_width;
-      double scale = std::min(scale_h, scale_w);
-      scales.push_back(scale);
-    }
-
-    auto upscaling_it = std::find_if(scales.begin(), scales.end(), [](double scale) { return scale >= 1.0; });
+      for (const auto& arrangement : possible_tile_arrangements) {
+        possible_canvas_sizes.emplace_back(arrangement.first * tile_size, arrangement.second * tile_size);
+      }
 
-    if (upscaling_it != scales.end()) {
-      selected_scale = *std::min_element(upscaling_it, scales.end());
-    } else {
-      selected_scale = *std::max_element(scales.begin(), scales.end());
-    }
+      std::vector<double> scales;
+      for (const auto& size : possible_canvas_sizes) {
+        double scale_h = static_cast<double>(size.first) / image_height;
+        double scale_w = static_cast<double>(size.second) / image_width;
+        scales.push_back(std::min(scale_h, scale_w));
+      }
 
-    for (size_t i = 0; i < scales.size(); ++i) {
-      if (std::abs(scales[i] - selected_scale) < std::numeric_limits<double>::epsilon()) {
-        chosen_canvas.push_back(possible_canvas_sizes[i]);
+      double selected_scale = 0;
+      std::vector<double> upscaling_options;
+      for (double scale : scales) {
+        if (scale >= 1) {
+          upscaling_options.push_back(scale);
+        }
       }
-    }
 
-    std::pair<int64_t, int64_t> optimal_canvas;
+      if (!upscaling_options.empty()) {
+        selected_scale = *std::min_element(upscaling_options.begin(), upscaling_options.end());
+      } else {
+        std::vector<double> downscaling_options;
+        for (double scale : scales) {
+          if (scale < 1) {
+            downscaling_options.push_back(scale);
+          }
+        }
+        selected_scale = *std::max_element(downscaling_options.begin(), downscaling_options.end());
+      }
 
-    if (chosen_canvas.size() > 1) {
-      int64_t min_area = std::numeric_limits<int64_t>::max();
-      for (const auto& canvas : chosen_canvas) {
-        int64_t area = canvas.first * canvas.second;
-        if (area < min_area) {
-          min_area = area;
-          optimal_canvas = canvas;
+      std::vector<std::pair<int, int>> chosen_canvas;
+      for (size_t i = 0; i < scales.size(); ++i) {
+        if (std::abs(scales[i] - selected_scale) < 1e-9) {
+          chosen_canvas.push_back(possible_canvas_sizes[i]);
         }
       }
-    } else {
-      optimal_canvas = chosen_canvas[0];
-    }
 
-    return std::make_pair(optimal_canvas.second, optimal_canvas.first);
+      if (chosen_canvas.size() > 1) {
+        auto optimal_canvas = std::min_element(chosen_canvas.begin(), chosen_canvas.end(),
+                                               [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
+                                                 return (a.first * a.second) < (b.first * b.second);
+                                               });
+        return *optimal_canvas;
+      } else {
+        return chosen_canvas[0];
+      }
+    }
   }
 
   static std::vector<int64_t> ConvertAspectRatiosToIds(const std::vector<std::pair<int64_t, int64_t>>& aspect_ratios,
@@ -269,9 +333,12 @@ struct Llama3ImageTransform {
     aspect_ratio = std::make_pair(num_tiles_height, num_tiles_width);
     auto [new_height, new_width] =
         GetImageSizeFitToCanvas(image_height, image_width, canvas_height, canvas_width, tile_size);
+
     Resize resizer;
-    std::unordered_map<std::string, ortx::AttrType> attrs = {
-        {"height", new_height}, {"width", new_width}, {"interpolation", std::string("LINEAR")}};
+    std::unordered_map<std::string, ortx::AttrType> attrs = {{"height", new_height},
+                                                             {"width", new_width},
+                                                             {"interpolation", std::string("LINEAR")},
+                                                             {"keep_aspect_ratio", int64_t(0)}};
     OrtxStatus status = resizer.Init(attrs);
     if (!status.IsOk()) {
       return status;
diff --git a/test/data/processor/load_ortx_tsdump.py b/test/data/processor/load_ortx_tsdump.py
index dcc35d3a..d2dfe129 100644
--- a/test/data/processor/load_ortx_tsdump.py
+++ b/test/data/processor/load_ortx_tsdump.py
@@ -4,7 +4,7 @@
 
 from PIL import Image
 
-dumping_file_path = "C:\\temp\\normalized_image822870921_f_1120.bin"
+dumping_file_path = "C:\\temp\\normalized_image826885234_f_560.bin"
 
 
 def regen_image(arr):
@@ -35,7 +35,7 @@ def regen_image(arr):
 raw_data = raw_data.reshape((image_height, image_width, 3))
 
 # from bgr to rgb
-raw_data = raw_data[:, :, ::-1]
+# raw_data = raw_data[:, :, ::-1]
 
 # save the image to disk
 if dtype == np.float32:
diff --git a/test/test_pp_api.py b/test/test_pp_api.py
index 8a7e35b0..4965f13f 100644
--- a/test/test_pp_api.py
+++ b/test/test_pp_api.py
@@ -50,10 +50,12 @@ def setUpClass(cls):
 
     def test_CLIP_image_processing(self):
         model_id = "openai/clip-vit-large-patch14"
+        image_list = ["test/data/processor/australia.jpg",
+                      "test/data/processor/passport.png",
+                      "test/data/processor/exceltable.png"]
+        (image, image2, image3) = [Image.open(f) for f in image_list]
+
         processor = AutoImageProcessor.from_pretrained(model_id)
-        image = Image.open("test/data/processor/australia.jpg")
-        image2 = Image.open("test/data/processor/passport.png")
-        image3 = Image.open("test/data/processor/exceltable.png")
         inputs = processor.preprocess(
             [image, image2, image3], return_tensors="np")
         print({k: v.shape if k == "pixel_values" else v for k, v in inputs.items()})
@@ -66,8 +68,7 @@ def test_CLIP_image_processing(self):
 
         ort_processor = pp_api.ImageProcessor(
             "test/data/processor/clip_image.json")
-        inputs = ort_processor.pre_process(
-            ["test/data/processor/australia.jpg", "test/data/processor/passport.png", "test/data/processor/exceltable.png"])
+        inputs = ort_processor.pre_process(image_list)
         print(ort_processor.to_numpy(inputs, 0).shape)
         actual_images = ort_processor.to_numpy(inputs, 0)
         for i in range(len(actual_images)):
@@ -77,8 +78,6 @@ def test_CLIP_image_processing(self):
 
     def test_llama3_2_image_processing(self):
         model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-        processor = AutoImageProcessor.from_pretrained(
-            model_id, token=TestPPAPI.token_id)
 
         url = ("https://huggingface.co/datasets/huggingface/"
                "documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg")
@@ -87,29 +86,33 @@ def test_llama3_2_image_processing(self):
             f.write(requests.get(url).content)
 
         # image = Image.open(requests.get(url, stream=True).raw)
-        image = Image.open(f"{self.temp_dir}/rabbit.jpg")
-        image2 = Image.open("test/data/processor/passport.png")
-        image3 = Image.open("test/data/processor/exceltable.png")
+        image_list = [f"{self.temp_dir}/rabbit.jpg",
+                      "test/data/processor/passport.png",
+                      "test/data/processor/exceltable.png"]
+        (image, image2, image3) = [Image.open(f) for f in image_list]
+
+        processor = AutoImageProcessor.from_pretrained(model_id, token=TestPPAPI.token_id)
         inputs = processor.preprocess(
             [image, image2, image3], return_tensors="np")
         print({k: v.shape if k == "pixel_values" else v for k, v in inputs.items()})
 
-        expected_images = inputs["pixel_values"][0][0]
-        for i in range(len(expected_images)):
-            expected = expected_images[i]
-            e_image = regen_image(np.transpose(expected, (1, 2, 0)))
-            e_image.save(f"{self.temp_dir}/e_{i}.png")
-
         ort_processor = pp_api.ImageProcessor(
             "test/data/processor/mllama/llama_3_image.json")
-        inputs = ort_processor.pre_process(
-            [f"{self.temp_dir}/rabbit.jpg", "test/data/processor/passport.png", "test/data/processor/exceltable.png"])
-        print(ort_processor.to_numpy(inputs, 0).shape)
-        actual_images = ort_processor.to_numpy(inputs, 0)[0]
-        for i in range(len(actual_images)):
-            actual = actual_images[i]
-            a_image = regen_image(np.transpose(actual, (1, 2, 0)))
-            a_image.save(f"{self.temp_dir}/a_{i}.png")
+        ort_inputs = ort_processor.to_numpy(ort_processor.pre_process(image_list), 0)
+        print(ort_inputs.shape)
+
+        for idx in range(len(image_list)):
+            expected_images = inputs["pixel_values"][0][idx]
+            for i in range(len(expected_images)):
+                expected = expected_images[i]
+                e_image = regen_image(np.transpose(expected, (1, 2, 0)))
+                e_image.save(f"{self.temp_dir}/e_{idx}_{i}.png")
+
+            actual_images = ort_inputs[idx]
+            for i in range(len(actual_images)):
+                actual = actual_images[i]
+                a_image = regen_image(np.transpose(actual, (1, 2, 0)))
+                a_image.save(f"{self.temp_dir}/a_{idx}_{i}.png")
 
 
 if __name__ == '__main__':