diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
index 5769cc393..06c0e4e5a 100644
--- a/.github/scripts/fbgemm_gpu_test.bash
+++ b/.github/scripts/fbgemm_gpu_test.bash
@@ -205,6 +205,22 @@ run_fbgemm_gpu_tests () {
   done
 }
 
+test_all_fbgemm_gpu_modules () {
+  local env_name="$1"
+  local fbgemm_variant="$2"
+
+  local target_directories=(
+    fbgemm_gpu/test
+    fbgemm_gpu/experimental/example/test
+  )
+
+  for test_dir in "${target_directories[@]}"; do
+    cd "${test_dir}" || return 1
+    run_fbgemm_gpu_tests "${env_name}" "${fbgemm_variant}" || return 1
+    cd - || return 1
+  done
+}
+
 
 ################################################################################
 # FBGEMM_GPU Test Bulk-Combination Functions
@@ -292,9 +308,8 @@ test_fbgemm_gpu_build_and_install () {
   cd ~/FBGEMM/ || return 1
   install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1
 
-  cd ~/FBGEMM/fbgemm_gpu/test || return 1
-  run_fbgemm_gpu_tests "${env_name}" "${pytorch_variant_type}" || return 1
-  cd - || return 1
+  cd ~/FBGEMM/ || return 1
+  test_all_fbgemm_gpu_modules "${env_name}" "${pytorch_variant_type}" || return 1
 }
 
 test_fbgemm_gpu_setup_and_pip_install () {
@@ -323,11 +338,11 @@ test_fbgemm_gpu_setup_and_pip_install () {
     local env_name="test_py${py_version}_pytorch_${pytorch_channel_version}_fbgemm_${fbgemm_gpu_channel_version}_${variant_type}/${variant_version}"
     local env_name="${env_name//\//_}"
 
-    test_setup_conda_environment "${env_name}" 'no-compiler' "${py_version}" pip "${pytorch_channel_version}" "${variant_type}" "${variant_version}" || return 1
-    install_fbgemm_gpu_pip "${env_name}" "${fbgemm_gpu_channel_version}" "${variant_type}/${variant_version}" || return 1
-    cd ~/FBGEMM/fbgemm_gpu/test || return 1
+    test_setup_conda_environment "${env_name}" 'no-compiler' "${py_version}" pip "${pytorch_channel_version}" "${variant_type}" "${variant_version}" || return 1
+    install_fbgemm_gpu_pip "${env_name}" "${fbgemm_gpu_channel_version}" "${variant_type}/${variant_version}" || return 1
+    cd ~/FBGEMM || return 1
 
-    run_fbgemm_gpu_tests "${env_name}" "${variant_type}";
+    test_all_fbgemm_gpu_modules "${env_name}" "${variant_type}";
     local retcode=$?
 
     echo "################################################################################"
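Note: the new wrapper is meant to be invoked from the FBGEMM repository root, as the updated call sites below show. A minimal usage sketch, assuming the test scripts have been sourced into the current shell; the environment name `build_env` is hypothetical:

# Run the test suites of all registered modules for the CPU variant.
# Assumes fbgemm_gpu_test.bash has been sourced and `build_env` (a
# hypothetical name) is an existing conda environment.
cd ~/FBGEMM
test_all_fbgemm_gpu_modules build_env cpu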
diff --git a/.github/scripts/nova_postscript.bash b/.github/scripts/nova_postscript.bash
index a9f2ad992..dc3871ca7 100644
--- a/.github/scripts/nova_postscript.bash
+++ b/.github/scripts/nova_postscript.bash
@@ -42,8 +42,8 @@ else
 fi
 
 $CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());"
-cd "${FBGEMM_REPO}/fbgemm_gpu/test" || { echo "[NOVA] Failed to cd to fbgemm_gpu/test from $(pwd)"; };
-run_fbgemm_gpu_tests "${BUILD_ENV_NAME}" "${fbgemm_variant}"
+cd "${FBGEMM_REPO}" || { echo "[NOVA] Failed to cd to ${FBGEMM_REPO} from $(pwd)"; };
+test_all_fbgemm_gpu_modules "${BUILD_ENV_NAME}" "${fbgemm_variant}"
 
 # Workaround EACCES: permission denied error at checkout step
 chown -R 1000:1000 /__w/FBGEMM/FBGEMM/ || echo "Unable to chown 1000:1000 from $USER, uid: $(id -u)"
diff --git a/.github/scripts/utils_base.bash b/.github/scripts/utils_base.bash
index 7ea56f816..bb814617f 100644
--- a/.github/scripts/utils_base.bash
+++ b/.github/scripts/utils_base.bash
@@ -88,7 +88,7 @@ env_name_or_prefix () {
 }
 
 test_network_connection () {
-  wget -q --timeout 1 pypi.org -O /dev/null
+  exec_with_retries 3 wget -q --timeout 1 pypi.org -O /dev/null
   local exit_status=$?
 
   # https://man7.org/linux/man-pages/man1/wget.1.html
@@ -96,7 +96,8 @@ test_network_connection () {
     echo "[CHECK] Network does not appear to be blocked."
   else
     echo "[CHECK] Network check exit status: ${exit_status}"
-    echo "[CHECK] Network appears to be blocked; please proxy the network connetions, i.e. re-run the command prefixed with 'with-proxy'."
+    echo "[CHECK] Network appears to be blocked or suffering from a poor connection."
+    echo "[CHECK] Please remember to proxy the network connections if needed, i.e. re-run the command prefixed with 'with-proxy'."
     return 1
   fi
 }
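Note: `exec_with_retries` is defined elsewhere in utils_base.bash; the contract the change above relies on is simply "re-run the wrapped command up to N extra times, keeping the last exit status". A hedged re-sketch of that contract, with hypothetical names and internals, not the actual implementation:

# Attempt the command; retry up to max_retries times on failure and
# propagate the last exit status.
retry_sketch () {
  local max_retries="$1"; shift
  local status=0
  for ((i = 0; i <= max_retries; i++)); do
    "$@" && return 0
    status=$?
    echo "[EXEC] Attempt ${i} failed with status ${status}"
  done
  return "${status}"
}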
diff --git a/.github/workflows/fbgemm_gpu_ci_cpu.yml b/.github/workflows/fbgemm_gpu_ci_cpu.yml
index 9d19b06f9..e5fd8d0ad 100644
--- a/.github/workflows/fbgemm_gpu_ci_cpu.yml
+++ b/.github/workflows/fbgemm_gpu_ci_cpu.yml
@@ -182,7 +182,7 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: ${{ matrix.host-machine.timeout }}
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
+      run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cpu
 
     - name: Push Wheel to PyPI
       if: ${{ (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true')) && matrix.compiler == 'gcc' }}
diff --git a/.github/workflows/fbgemm_gpu_ci_cuda.yml b/.github/workflows/fbgemm_gpu_ci_cuda.yml
index b76870245..fd68558f2 100644
--- a/.github/workflows/fbgemm_gpu_ci_cuda.yml
+++ b/.github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -202,7 +202,7 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: 20
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
+      run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cuda
 
     - name: Push Wheel to PyPI
       if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }}
diff --git a/.github/workflows/fbgemm_gpu_ci_rocm.yml b/.github/workflows/fbgemm_gpu_ci_rocm.yml
index f3fca6f5b..4e35f8cd5 100644
--- a/.github/workflows/fbgemm_gpu_ci_rocm.yml
+++ b/.github/workflows/fbgemm_gpu_ci_rocm.yml
@@ -191,4 +191,4 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: 20
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
+      run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV rocm
diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml
index 8ef3f1d85..342f56294 100644
--- a/.github/workflows/fbgemm_gpu_pip.yml
+++ b/.github/workflows/fbgemm_gpu_pip.yml
@@ -99,7 +99,7 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: ${{ matrix.host-machine.timeout }}
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
+      run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cpu
 
 
   test_pypi_install_cuda:
@@ -159,7 +159,7 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: 20
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
+      run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cuda
 
 
   test_pypi_install_rocm:
@@ -225,4 +225,4 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: 20
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
+      run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV rocm
diff --git a/.github/workflows/fbgemm_gpu_release_cpu.yml b/.github/workflows/fbgemm_gpu_release_cpu.yml
index 426143814..a21a90eb0 100644
--- a/.github/workflows/fbgemm_gpu_release_cpu.yml
+++ b/.github/workflows/fbgemm_gpu_release_cpu.yml
@@ -174,7 +174,7 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: ${{ matrix.host-machine.timeout }}
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
+      run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cpu
 
     - name: Push FBGEMM_GPU (CPU version) Binary to PYPI
       if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' }}
diff --git a/.github/workflows/fbgemm_gpu_release_cuda.yml b/.github/workflows/fbgemm_gpu_release_cuda.yml
index ea806f357..c64082660 100644
--- a/.github/workflows/fbgemm_gpu_release_cuda.yml
+++ b/.github/workflows/fbgemm_gpu_release_cuda.yml
@@ -184,7 +184,7 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: 20
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
+      run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cuda
 
     - name: Push FBGEMM_GPU Binary to PYPI
       if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == github.event.inputs.cuda_version }}
diff --git a/cmake/modules/CudaSetup.cmake b/cmake/modules/CudaSetup.cmake
new file mode 100644
index 000000000..d86963109
--- /dev/null
+++ b/cmake/modules/CudaSetup.cmake
@@ -0,0 +1,28 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake)
+
+
+################################################################################
+# CUDA Setup
+################################################################################
+
+# Set NVML_LIB_PATH if provided, or detect the default lib path
+if(NOT NVML_LIB_PATH)
+  set(DEFAULT_NVML_LIB_PATH
+      "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so")
+
+  if(EXISTS ${DEFAULT_NVML_LIB_PATH})
+    message(STATUS "Setting NVML_LIB_PATH: \
+        ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so")
+    set(NVML_LIB_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so")
+  endif()
+endif()
+
+if(NVML_LIB_PATH)
+  message(STATUS "Found NVML_LIB_PATH: ${NVML_LIB_PATH}")
+endif()
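Note: because the stub library is only auto-detected when NVML_LIB_PATH is unset, a build can point it at a concrete libnvidia-ml.so instead. A hypothetical configure invocation from fbgemm_gpu/; the .so path is illustrative and machine-specific:

# Override the NVML stub detection above with an explicit library path.
cmake -S . -B build -DNVML_LIB_PATH=/usr/lib/x86_64-linux-gnu/libnvidia-ml.so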
diff --git a/cmake/modules/CxxCompilerSetup.cmake b/cmake/modules/CxxCompilerSetup.cmake
new file mode 100644
index 000000000..11fb3f891
--- /dev/null
+++ b/cmake/modules/CxxCompilerSetup.cmake
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake)
+
+
+################################################################################
+# CMake C++ Setup
+################################################################################
+
+# SET THE C AND C++ VERSIONS HERE
+set(C_VERSION 17)
+set(CXX_VERSION 20)
+
+# Set the default C++ standard to CXX_VERSION if CMAKE_CXX_STANDARD is not
+# supplied by CMake command invocation.
+# Individual targets can have this value overridden; see
+# https://cmake.org/cmake/help/latest/variable/CMAKE_CXX_STANDARD.html
+# https://cmake.org/cmake/help/latest/prop_tgt/CXX_STANDARD.html
+# https://cmake.org/cmake/help/latest/prop_tgt/HIP_STANDARD.html
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD ${CXX_VERSION})
+  set(CMAKE_HIP_STANDARD ${CXX_VERSION})
+  set(CXX_STANDARD ${CXX_VERSION})
+  set(HIP_STANDARD ${CXX_VERSION})
+endif()
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(HIP_STANDARD_REQUIRED ON)
+
+# Set the default C standard to C_VERSION if CMAKE_C_STANDARD is not supplied
+# by CMake command invocation.
+# Individual targets can have this value overridden; see
+# https://cmake.org/cmake/help/latest/variable/CMAKE_C_STANDARD.html
+# https://cmake.org/cmake/help/latest/prop_tgt/C_STANDARD.html
+if(NOT CMAKE_C_STANDARD)
+  set(C_STANDARD ${C_VERSION})
+  set(CMAKE_C_STANDARD ${C_VERSION})
+endif()
+set(CMAKE_C_EXTENSIONS OFF)
+set(CMAKE_C_STANDARD_REQUIRED ON)
+
+if(DEFINED GLIBCXX_USE_CXX11_ABI)
+  if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
+  else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+  endif()
+endif()
+
+BLOCK_PRINT(
+  "Default C compiler flags"
+  "(values may be overridden by CMAKE_CXX_STANDARD and CXX_STANDARD):"
+  ""
+  "${CMAKE_C_FLAGS}"
+)
+
+BLOCK_PRINT(
+  "Default C++ compiler flags"
+  "(values may be overridden by CMAKE_CXX_STANDARD and CXX_STANDARD):"
+  ""
+  "${CMAKE_CXX_FLAGS}"
+)
+
+# Strip all symbols from the .SO file after building
+add_link_options($<$<CONFIG:Release>:-s>)
+
+# Set flags for AVX2
+set(AVX2_FLAGS "-mavx2;-mf16c;-mfma;-fopenmp")
+if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
+  # NVCC in WSL complains about unknown -mavx options
+  # https://github.com/pytorch/FBGEMM/issues/2135
+  set(AVX2_FLAGS "-Xcompiler;-mavx;-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-fopenmp")
+endif()
+
+# Set flags for AVX512
+set(AVX512_FLAGS "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl;-fopenmp")
+if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
+  set(AVX512_FLAGS "-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-Xcompiler;-mavx512f;-Xcompiler;-mavx512bw;-Xcompiler;-mavx512dq;-Xcompiler;-mavx512vl;-fopenmp")
+endif()
diff --git a/cmake/modules/FindAVX.cmake b/cmake/modules/FindAVX.cmake
index 0cf20f5a4..5bd8cffd6 100644
--- a/cmake/modules/FindAVX.cmake
+++ b/cmake/modules/FindAVX.cmake
@@ -82,7 +82,6 @@ MACRO(CHECK_SSE lang type flags)
   ENDIF()
 
   MARK_AS_ADVANCED(${lang}_${type}_FOUND ${lang}_${type}_FLAGS)
-
 ENDMACRO()
 
 CHECK_SSE(C "AVX" " ;-mavx;/arch:AVX")
diff --git a/cmake/modules/PyTorchSetup.cmake b/cmake/modules/PyTorchSetup.cmake
new file mode 100644
index 000000000..a5b73eb6f
--- /dev/null
+++ b/cmake/modules/PyTorchSetup.cmake
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake)
+
+
+################################################################################
+# PyTorch Dependencies Setup
+################################################################################
+
+find_package(Torch REQUIRED)
+
+#
+# Torch CUDA extensions are normally compiled with the flags below. However, we
+# disabled -D__CUDA_NO_HALF_CONVERSIONS__ here as it caused "error: no suitable
+# constructor exists to convert from "int" to "__half" errors in
+# gen_embedding_forward_quantized_split_[un]weighted_codegen_cuda.cu
+#
+
+set(TORCH_CUDA_OPTIONS
+    --expt-relaxed-constexpr -D__CUDA_NO_HALF_OPERATORS__
+    # -D__CUDA_NO_HALF_CONVERSIONS__
+    -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__)
diff --git a/cmake/modules/RocmSetup.cmake b/cmake/modules/RocmSetup.cmake
new file mode 100644
index 000000000..7e37893bf
--- /dev/null
+++ b/cmake/modules/RocmSetup.cmake
@@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake)
+
+
+################################################################################
+# ROCm and HIPify Setup
+################################################################################
+
+if(USE_ROCM)
+  # Load CMake modules
+  list(APPEND CMAKE_MODULE_PATH
+       "${PROJECT_SOURCE_DIR}/cmake"
+       "${THIRDPARTY}/hipify_torch/cmake")
+  include(Hip)
+  include(Hipify)
+
+  # Configure compiler for HIP
+  list(APPEND HIP_HCC_FLAGS
+       " \"-Wno-#pragma-messages\" "
+       " \"-Wno-#warnings\" "
+       -Wno-cuda-compat
+       -Wno-deprecated-declarations
+       -Wno-format
+       -Wno-ignored-attributes
+       -Wno-unused-result)
+
+  BLOCK_PRINT(
+    "HIP found: ${HIP_FOUND}"
+    "HIPCC compiler flags:"
+    ""
+    "${HIP_HCC_FLAGS}"
+  )
+endif()
diff --git a/cmake/modules/Utilities.cmake b/cmake/modules/Utilities.cmake
new file mode 100644
index 000000000..2630a22df
--- /dev/null
+++ b/cmake/modules/Utilities.cmake
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+################################################################################
+# Utility Functions
+################################################################################
+
+function(BLOCK_PRINT)
+  message("")
+  message("")
+  message("================================================================================")
+  foreach(ARG IN LISTS ARGN)
+    message("${ARG}")
+  endforeach()
+  message("================================================================================")
+  message("")
+endfunction()
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
index b23bf74af..7b3ba7ecd 100644
--- a/fbgemm_gpu/CMakeLists.txt
+++ b/fbgemm_gpu/CMakeLists.txt
@@ -10,20 +10,12 @@
 
 cmake_minimum_required(VERSION 3.25.0 FATAL_ERROR)
 
-function(BLOCK_PRINT)
-  message("")
-  message("")
-  message("================================================================================")
-  foreach(ARG IN LISTS ARGN)
-    message("${ARG}")
-  endforeach()
-  message("================================================================================")
-  message("")
-endfunction()
-
 set(CMAKEMODULES ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules)
 set(FBGEMM ${CMAKE_CURRENT_SOURCE_DIR}/..)
 set(THIRDPARTY ${FBGEMM}/third_party)
+set(CMAKE_CODEGEN_DIR ${CMAKE_CURRENT_SOURCE_DIR}/codegen)
+
+include(${CMAKEMODULES}/Utilities.cmake)
 
 
 ################################################################################
@@ -53,81 +45,13 @@ else()
 endif()
 
 
-################################################################################
-# FBGEMM_GPU C++ Setup
-################################################################################
-
-# Set the default C++ standard to C++20 if CMAKE_CXX_STANDARD is not supplied
-# by CMake command invocation.
-# Individual targets can have this value overridden; see
-# https://cmake.org/cmake/help/latest/variable/CMAKE_CXX_STANDARD.html
-# https://cmake.org/cmake/help/latest/prop_tgt/CXX_STANDARD.html
-# https://cmake.org/cmake/help/latest/prop_tgt/HIP_STANDARD.html
-if(NOT CMAKE_CXX_STANDARD)
-  set(CMAKE_CXX_STANDARD 20)
-  set(CMAKE_HIP_STANDARD 20)
-  set(CXX_STANDARD 20)
-  set(HIP_STANDARD 20)
-endif()
-set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(HIP_STANDARD_REQUIRED ON)
-
-# Set the default C standard to C17
-# Individual targets can have this value overridden; see
-# https://cmake.org/cmake/help/latest/variable/CMAKE_C_STANDARD.html
-# https://cmake.org/cmake/help/latest/prop_tgt/C_STANDARD.html
-set(C_STANDARD 20)
-set(CMAKE_C_STANDARD 17)
-set(CMAKE_C_EXTENSIONS OFF)
-set(CMAKE_C_STANDARD_REQUIRED ON)
-
-if(DEFINED GLIBCXX_USE_CXX11_ABI)
-  if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
-  else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
-  endif()
-endif()
-
-BLOCK_PRINT(
-  "Default C compiler flags"
-  "(values may be overridden by CMAKE_CXX_STANDARD and CXX_STANDARD):"
-  ""
-  "${CMAKE_C_FLAGS}"
-)
-
-BLOCK_PRINT(
-  "Default C++ compiler flags"
-  "(values may be overridden by CMAKE_CXX_STANDARD and CXX_STANDARD):"
-  ""
-  "${CMAKE_CXX_FLAGS}"
-)
-
-# Strip all symbols from the .SO file after building
-add_link_options($<$<CONFIG:Release>:-s>)
-
-# Set flags for AVX2
-set(AVX2_FLAGS "-mavx2;-mf16c;-mfma;-fopenmp")
-if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
-  # NVCC in WSL complains about unknown -mavx options
-  # https://github.com/pytorch/FBGEMM/issues/2135
-  set(AVX2_FLAGS "-Xcompiler;-mavx;-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-fopenmp")
-endif()
-
-# Set flags for AVX512
-set(AVX512_FLAGS "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl;-fopenmp")
-if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
-  set(AVX512_FLAGS "-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-Xcompiler;-mavx512f;-Xcompiler;-mavx512bw;-Xcompiler;-mavx512dq;-Xcompiler;-mavx512vl;-fopenmp")
-endif()
-
-set(CMAKE_CODEGEN_DIR ${CMAKE_CURRENT_SOURCE_DIR}/codegen)
-
-
 ################################################################################
 # FBGEMM_GPU Build Kickstart
 ################################################################################
 
+# FBGEMM_GPU C++ Setup - must be set BEFORE project declaration
+include(${CMAKEMODULES}/CxxCompilerSetup.cmake)
+
 if(SKBUILD)
   BLOCK_PRINT("The project is built using scikit-build")
 endif()
@@ -135,87 +59,26 @@ endif()
 if(FBGEMM_CPU_ONLY OR USE_ROCM)
   project(
     fbgemm_gpu
-    VERSION 0.3.1
+    VERSION 0.7.0
     LANGUAGES CXX C)
 else()
   project(
     fbgemm_gpu
-    VERSION 0.3.1
+    VERSION 0.7.0
     LANGUAGES CXX C CUDA)
 endif()
 
+# AVX Flags Setup - must be set AFTER project declaration
 include(${CMAKEMODULES}/FindAVX.cmake)
 
-
-################################################################################
 # PyTorch Dependencies Setup
-################################################################################
-
-find_package(Torch REQUIRED)
+include(${CMAKEMODULES}/PyTorchSetup.cmake)
 
-#
-# Toch Cuda Extensions are normally compiled with the flags below. However we
-# disabled -D__CUDA_NO_HALF_CONVERSIONS__ here as it caused "error: no suitable
-# constructor exists to convert from "int" to "__half" errors in
-# gen_embedding_forward_quantized_split_[un]weighted_codegen_cuda.cu
-#
-
-set(TORCH_CUDA_OPTIONS
-    --expt-relaxed-constexpr -D__CUDA_NO_HALF_OPERATORS__
-    # -D__CUDA_NO_HALF_CONVERSIONS__
-    -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__)
-
-
-################################################################################
 # CUDA Setup
-################################################################################
-
-# Set NVML_LIB_PATH if provided, or detect the default lib path
-if(NOT NVML_LIB_PATH)
-  set(DEFAULT_NVML_LIB_PATH
-      "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so")
-
-  if(EXISTS ${DEFAULT_NVML_LIB_PATH})
-    message(STATUS "Setting NVML_LIB_PATH: \
-        ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so")
-    set(NVML_LIB_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so")
-  endif()
-endif()
-
-if(NVML_LIB_PATH)
-  message(STATUS "Found NVML_LIB_PATH: ${NVML_LIB_PATH}")
-endif()
-
+include(${CMAKEMODULES}/CudaSetup.cmake)
 
-################################################################################
 # ROCm and HIPify Setup
-################################################################################
-
-if(USE_ROCM)
-  # Load CMake modules
-  list(APPEND CMAKE_MODULE_PATH
-       "${PROJECT_SOURCE_DIR}/cmake"
-       "${THIRDPARTY}/hipify_torch/cmake")
-  include(Hip)
-  include(Hipify)
-
-  # Configure compiler for HIP
-  list(APPEND HIP_HCC_FLAGS
-       " \"-Wno-#pragma-messages\" "
-       " \"-Wno-#warnings\" "
-       -Wno-cuda-compat
-       -Wno-deprecated-declarations
-       -Wno-format
-       -Wno-ignored-attributes
-       -Wno-unused-result)
-
-  BLOCK_PRINT(
-    "HIP found: ${HIP_FOUND}"
-    "HIPCC compiler flags:"
-    ""
-    "${HIP_HCC_FLAGS}"
-  )
-endif()
+include(${CMAKEMODULES}/RocmSetup.cmake)
 
 
 ################################################################################
@@ -823,3 +686,11 @@ install(FILES ${gen_python_source_files}
 
 install(FILES ${gen_defused_optim_py_files}
         DESTINATION fbgemm_gpu/split_embedding_optimizer_codegen)
+
+
+
+################################################################################
+# Build Experimental Modules
+################################################################################
+
+add_subdirectory(experimental/example)
diff --git a/fbgemm_gpu/experimental/example/CMakeLists.txt b/fbgemm_gpu/experimental/example/CMakeLists.txt
new file mode 100644
index 000000000..d6d4b55aa
--- /dev/null
+++ b/fbgemm_gpu/experimental/example/CMakeLists.txt
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+include(${CMAKEMODULES}/Utilities.cmake)
+
+################################################################################
+# Target Sources
+################################################################################
+
+set(experimental_example_cpp_source_files
+    src/example_ops.cpp)
+
+set(experimental_example_python_source_files
+    example/__init__.py
+    example/utils.py)
+
+################################################################################
+# Build Shared Library
+################################################################################
+
+add_library(fbgemm_gpu_experimental_example_py MODULE
+    ${experimental_example_cpp_source_files})
+
+target_include_directories(fbgemm_gpu_experimental_example_py PRIVATE ${TORCH_INCLUDE_DIRS})
+target_link_libraries(fbgemm_gpu_experimental_example_py ${TORCH_LIBRARIES})
+
+# Drop the `lib` prefix so the artifact is named `fbgemm_gpu_experimental_example_py.so`
+set_target_properties(fbgemm_gpu_experimental_example_py PROPERTIES PREFIX "")
+
+################################################################################
+# Install Shared Library and Python Files
+################################################################################
+
+install(TARGETS fbgemm_gpu_experimental_example_py
+        DESTINATION fbgemm_gpu/experimental/example)
+
+install(FILES ${experimental_example_python_source_files}
+        DESTINATION fbgemm_gpu/experimental/example)
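Note: once this subdirectory is wired in via add_subdirectory(), the module can be rebuilt in isolation during development. A hypothetical invocation; the `build` directory name and output path are illustrative, following CMake's convention of mirroring the source tree in the binary tree:

# Build only the experimental example target and inspect the artifact.
cmake --build build --target fbgemm_gpu_experimental_example_py -j 8
ls build/experimental/example/fbgemm_gpu_experimental_example_py.so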
diff --git a/fbgemm_gpu/experimental/example/example/__init__.py b/fbgemm_gpu/experimental/example/example/__init__.py
new file mode 100644
index 000000000..d4bea7d44
--- /dev/null
+++ b/fbgemm_gpu/experimental/example/example/__init__.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+
+import torch
+
+try:
+    torch.ops.load_library(
+        os.path.join(os.path.dirname(__file__), "fbgemm_gpu_experimental_example_py.so")
+    )
+except Exception as e:
+    print(e)
+
+# Since __init__.py is only used in OSS context, we define `open_source` here
+# and use its existence to determine whether or not we are in OSS context
+open_source: bool = True
diff --git a/fbgemm_gpu/experimental/example/example/utils.py b/fbgemm_gpu/experimental/example/example/utils.py
new file mode 100644
index 000000000..19a98377f
--- /dev/null
+++ b/fbgemm_gpu/experimental/example/example/utils.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import torch
+
+
+def add_tensors(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+    return torch.ops.fbgemm.add_tensors_float(a, b)
diff --git a/fbgemm_gpu/experimental/example/src/example_ops.cpp b/fbgemm_gpu/experimental/example/src/example_ops.cpp
new file mode 100644
index 000000000..585630373
--- /dev/null
+++ b/fbgemm_gpu/experimental/example/src/example_ops.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+#include <torch/library.h>
+
+namespace fbgemm_gpu::experimental {
+
+at::Tensor add_tensors_float(const at::Tensor& a, const at::Tensor& b) {
+  return a.to(at::kFloat) + b.to(at::kFloat);
+}
+
+TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
+  m.def("add_tensors_float(Tensor a, Tensor b) -> Tensor");
+}
+
+TORCH_LIBRARY_IMPL(fbgemm, CPU, m) {
+  m.impl(
+      "add_tensors_float",
+      torch::dispatch(
+          c10::DispatchKey::CPU,
+          TORCH_FN(fbgemm_gpu::experimental::add_tensors_float)));
+}
+
+} // namespace fbgemm_gpu::experimental
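Note: taken together, the files above register a custom op in the `fbgemm` namespace, expose a typed Python wrapper, and load the .so at import time. A quick smoke check from the shell, assuming a wheel that includes the experimental module has been installed:

# Load the package and call the op through its Python wrapper.
# Inputs are int tensors; the op casts to float, so this prints tensor([4., 6.]).
python3 -c "
import torch
from fbgemm_gpu.experimental.example import utils
print(utils.add_tensors(torch.tensor([1, 2]), torch.tensor([3, 4])))
"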
diff --git a/fbgemm_gpu/experimental/example/test/add_tensors_float_test.py b/fbgemm_gpu/experimental/example/test/add_tensors_float_test.py
new file mode 100644
index 000000000..5d0cd40e2
--- /dev/null
+++ b/fbgemm_gpu/experimental/example/test/add_tensors_float_test.py
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import unittest
+
+import torch
+
+from fbgemm_gpu.experimental.example import utils
+
+
+class ExampleTest(unittest.TestCase):
+    def test_add_tensors_float(self) -> None:
+        a = torch.tensor([1, 2, 3])
+        b = torch.tensor([4, 5, 6])
+        expected = torch.tensor([5, 7, 9], dtype=torch.float)
+        c = utils.add_tensors(a, b)
+        torch.testing.assert_close(c.cpu(), expected.cpu())
+
+
+if __name__ == "__main__":
+    unittest.main()
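Note: this test directory is one of the entries `test_all_fbgemm_gpu_modules` walks. To run it in isolation, a sketch assuming an installed build; the file also runs standalone via its `unittest.main()` entry point:

# Run the new module's tests directly.
cd fbgemm_gpu/experimental/example/test
python3 -m pytest add_tensors_float_test.py -v   # or: python3 add_tensors_float_test.py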