pytorch · q10 · Sep 26, 2024 · Sep 26, 2024 · Sep 26, 2024 · Sep 26, 2024
diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp
@@ -41,6 +41,16 @@ inline uint32_t pruned_hash_function(uint32_t h) {
     return h;
 }
 
+inline uint64_t pruned_hash_function(uint64_t k) {
+  // MurmorHash3 64-bit mixing function.
+  k ^= k >> 33;
+  k *= (0xff51afd7ed558ccd);
+  k ^= k >> 33;
+  k *= (0xc4ceb9fe1a85ec53);
+  k ^= k >> 33;
+  return k;
+}
+
 } // namespace
 
 void pruned_hashmap_insert_{{ wdesc }}_cpu(
@@ -404,58 +414,72 @@ Tensor pruned_hashmap_lookup_{{ wdesc }}_cpu(
     TENSOR_ON_CPU(offsets);
     TENSOR_ON_CPU(hash_table);
     TENSOR_ON_CPU(hash_table_offsets);
+    TENSORS_HAVE_SAME_SCALAR_TYPE(indices, offsets, hash_table);
 
     int32_t T = hash_table_offsets.size(0) - 1;
     int32_t B = (offsets.size(0) - 1) / T;
     TORCH_CHECK(B > 0);
+
     auto dense_indices = empty_like(indices);
-    const auto* indices_acc = indices.data_ptr<int32_t>();
-    auto* dense_indices_acc = dense_indices.data_ptr<int32_t>();
 
-    const auto* offsets_acc = offsets.data_ptr<int32_t>();
-    const auto hash_table_acc = hash_table.accessor<int32_t, 2>();
-    const auto hash_table_offsets_acc = hash_table_offsets.accessor<int64_t, 1>();
-for (const auto t : c10::irange(T)) {
-        int64_t table_start = hash_table_offsets_acc[t];
-        int64_t table_end = hash_table_offsets_acc[t + 1];
-        int64_t capacity = table_end - table_start;
-for (const auto b : c10::irange(B)) {
-            int32_t indices_start = offsets_acc[t * B + b];
-            int32_t indices_end = offsets_acc[t * B + b + 1];
-            int32_t L = indices_end - indices_start;
+    AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "pruned_hashmap_lookup_{{ wdesc }}_cpu", [&] {
+        using hash_t =
+            std::conditional_t<std::is_same_v<index_t, int64_t>, uint64_t, uint32_t>;
 
-            if (table_start == table_end) {
-for (const auto l : c10::irange(L)) {
-                    dense_indices_acc[indices_start + l] = indices_acc[indices_start + l];
-                }
-            } else {
-for (const auto l : c10::irange(L)) {
-                    int32_t idx = indices_acc[indices_start + l];
-                    uint32_t slot = pruned_hash_function(static_cast<uint32_t>(idx)) % capacity;
-                    while (true) {
-                        int32_t slot_sparse_idx = hash_table_acc[table_start + static_cast<int64_t>(slot)][0];
-
-                        // empty slot
-                        if (slot_sparse_idx == -1) {
-                            dense_indices_acc[indices_start + l] = -1;
-                            break;
-                        }
-                        // already exists
-                        if (slot_sparse_idx == idx) {
-                            dense_indices_acc[indices_start + l] = hash_table_acc[table_start + static_cast<int64_t>(slot)][1];
-                            break;
+        const auto* indices_acc = indices.data_ptr<index_t>();
+        auto* dense_indices_acc = dense_indices.data_ptr<index_t>();
+
+        const auto* offsets_acc = offsets.data_ptr<index_t>();
+        const auto hash_table_acc = hash_table.accessor<index_t, 2>();
+        const auto hash_table_offsets_acc = hash_table_offsets.accessor<int64_t, 1>();
+
+        for (const auto t : c10::irange(T)) {
+            const auto table_start = hash_table_offsets_acc[t];
+            const auto table_end = hash_table_offsets_acc[t + 1];
+            const auto capacity = table_end - table_start;
+
+            for (const auto b : c10::irange(B)) {
+                const auto indices_start = offsets_acc[t * B + b];
+                const auto indices_end = offsets_acc[t * B + b + 1];
+                const auto L = indices_end - indices_start;
+
+                if (table_start == table_end) {
+                    for (const auto l : c10::irange(L)) {
+                        dense_indices_acc[indices_start + l] = indices_acc[indices_start + l];
+                    }
+
+                } else {
+                    for (const auto l : c10::irange(L)) {
+                        const auto idx = indices_acc[indices_start + l];
+                        auto slot = pruned_hash_function(static_cast<hash_t>(idx)) % capacity;
+
+                        while (true) {
+                            const auto slot_sparse_idx = hash_table_acc[table_start + static_cast<int64_t>(slot)][0];
+
+                            // empty slot
+                            if (slot_sparse_idx == -1) {
+                                dense_indices_acc[indices_start + l] = -1;
+                                break;
+                            }
+                            // already exists
+                            if (slot_sparse_idx == idx) {
+                                dense_indices_acc[indices_start + l] = hash_table_acc[table_start + static_cast<int64_t>(slot)][1];
+                                break;
+                            }
+                            // linear probe
+                            slot = (slot + 1) % capacity;
                         }
-                        // linear probe
-                        slot = (slot + 1) % capacity;
                     }
                 }
             }
         }
-    }
+    });
+
     return dense_indices;
 }
 
 {% if not weighted %}
+
 Tensor pruned_array_lookup_cpu(
     Tensor indices,
     Tensor offsets,
@@ -465,37 +489,46 @@ Tensor pruned_array_lookup_cpu(
     TENSOR_ON_CPU(offsets);
     TENSOR_ON_CPU(index_remappings);
     TENSOR_ON_CPU(index_remappings_offsets);
+    TENSORS_HAVE_SAME_SCALAR_TYPE(indices, offsets, index_remappings);
 
     int32_t T = index_remappings_offsets.size(0) - 1;
     int32_t B = (offsets.size(0) - 1) / T;
     TORCH_CHECK(B > 0);
+
     auto dense_indices = empty_like(indices);
-    const auto* indices_acc = indices.data_ptr<int32_t>();
-    auto* dense_indices_acc = dense_indices.data_ptr<int32_t>();
-    const auto* offsets_acc = offsets.data_ptr<int32_t>();
 
-    const auto index_remappings_acc = index_remappings.data_ptr<int32_t>();
-    const auto index_remappings_offsets_acc = index_remappings_offsets.data_ptr<int64_t>();
-    at::parallel_for(0, T, 1, [&](int64_t begin, int64_t end) {
-      for (const auto t : c10::irange(begin, end)) {
-          int64_t index_remappings_start = index_remappings_offsets_acc[t];
-          int64_t index_remappings_end = index_remappings_offsets_acc[t + 1];
-          int64_t capacity = index_remappings_end - index_remappings_start;
-          int32_t indices_start = offsets_acc[t * B];
-          int32_t indices_end = offsets_acc[(t + 1) * B];
-          if (capacity > 0) {
-            for (const auto i : c10::irange(indices_start,indices_end)) {
-                  int32_t idx = indices_acc[i];
-                  dense_indices_acc[i] = index_remappings_acc[index_remappings_start + idx];
-              }
-          } else {
-              std::memcpy(
-                  dense_indices_acc + indices_start,
-                  indices_acc + indices_start,
-                  (indices_end - indices_start) * sizeof(int32_t));
-          }
-      }
+    AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "pruned_array_lookup_cpu", [&] {
+        const auto* indices_acc = indices.data_ptr<index_t>();
+        auto* dense_indices_acc = dense_indices.data_ptr<index_t>();
+        const auto* offsets_acc = offsets.data_ptr<index_t>();
+
+        const auto index_remappings_acc = index_remappings.data_ptr<index_t>();
+        const auto index_remappings_offsets_acc = index_remappings_offsets.data_ptr<int64_t>();
+
+        at::parallel_for(0, T, 1, [&](int64_t begin, int64_t end) {
+        for (const auto t : c10::irange(begin, end)) {
+            const auto index_remappings_start = index_remappings_offsets_acc[t];
+            const auto index_remappings_end = index_remappings_offsets_acc[t + 1];
+            const auto capacity = index_remappings_end - index_remappings_start;
+
+            const auto indices_start = offsets_acc[t * B];
+            const auto indices_end = offsets_acc[(t + 1) * B];
+
+            if (capacity > 0) {
+                for (const auto i : c10::irange(indices_start, indices_end)) {
+                    auto idx = indices_acc[i];
+                    dense_indices_acc[i] = index_remappings_acc[index_remappings_start + idx];
+                }
+            } else {
+                std::memcpy(
+                    dense_indices_acc + indices_start,
+                    indices_acc + indices_start,
+                    (indices_end - indices_start) * sizeof(index_t));
+            }
+        }
+        });
     });
+
     return dense_indices;
 }
 

diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp
@@ -21,6 +21,7 @@
 #include "fbgemm_gpu/embedding_common.h"
 #include "fbgemm_gpu/utils/dispatch_macros.h"
 #include "fbgemm_gpu/utils/ops_utils.h"
+#include "fbgemm_gpu/utils/tensor_utils.h"
 
 using Tensor = at::Tensor;
 using namespace fbgemm_gpu;
@@ -374,29 +375,37 @@ class PrunedMapCPU : public torch::jit::CustomClassHolder {
   }
 
   Tensor lookup(Tensor indices, Tensor offsets) const {
+    TENSORS_HAVE_SAME_SCALAR_TYPE(indices, offsets);
+
     int32_t T = maps_.size();
     TORCH_CHECK(T > 0);
     int32_t B = (offsets.size(0) - 1) / T;
     TORCH_CHECK(B > 0);
     TORCH_CHECK(maps_.size() == T);
+
     auto dense_indices = empty_like(indices);
-    const auto* indices_acc = indices.data_ptr<int32_t>();
-    auto* dense_indices_acc = dense_indices.data_ptr<int32_t>();
-    const auto* offsets_acc = offsets.data_ptr<int32_t>();
-    for (const auto t : c10::irange(T)) {
-      auto& map = maps_[t];
-      for (const auto b : c10::irange(B)) {
-        int32_t indices_start = offsets_acc[t * B + b];
-        int32_t indices_end = offsets_acc[t * B + b + 1];
-        int32_t L = indices_end - indices_start;
-        for (const auto l : c10::irange(L)) {
-          int32_t slot_sparse_index = indices_acc[indices_start + l];
-          auto it = map.find(slot_sparse_index);
-          dense_indices_acc[indices_start + l] =
-              it != map.end() ? it->second : -1;
+
+    AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "PrunedMapCPU::lookup", [&] {
+      const auto* indices_acc = indices.data_ptr<index_t>();
+      auto* dense_indices_acc = dense_indices.data_ptr<index_t>();
+      const auto* offsets_acc = offsets.data_ptr<index_t>();
+
+      for (const auto t : c10::irange(T)) {
+        auto& map = maps_[t];
+        for (const auto b : c10::irange(B)) {
+          const auto indices_start = offsets_acc[t * B + b];
+          const auto indices_end = offsets_acc[t * B + b + 1];
+          const auto L = indices_end - indices_start;
+          for (const auto l : c10::irange(L)) {
+            const auto slot_sparse_index = indices_acc[indices_start + l];
+            const auto it = map.find(slot_sparse_index);
+            dense_indices_acc[indices_start + l] =
+                it != map.end() ? it->second : -1;
+          }
         }
       }
-    }
+    });
+
     return dense_indices;
   }