TEST CI #882

Open · wants to merge 2 commits into main
2 changes: 2 additions & 0 deletions docs/source/reference/env.rst
@@ -70,3 +70,5 @@ Miscellaneous
**AIT_PLOT_SHORTEN_TENSOR_NAMES**: If set to "1", shorten overly long tensor names in a plot of a model graph, making the plot much easier to analyze visually. "0" by default.

**AIT_USE_FAST_MATH**: If set to "0", no fast math option will be used for the device code generation. Default value is "1".

**AIT_USE_TANH_FOR_SIGMOID**: If set to "1", tanh will be used to approximate sigmoid during device code generation. Default value is "0".
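For readers skimming the diff, a minimal sketch (not part of this PR) of how these variables are consumed: they are read from the process environment when compilation is triggered, so they must be set beforehand. The variable names come from the documentation above; everything else is illustrative.

```python
# Minimal sketch: enable the new option via the environment before any
# AITemplate compilation runs in this process. "0" (exact sigmoid) is the
# documented default; AIT_USE_FAST_MATH remains an independent toggle.
import os

os.environ["AIT_USE_TANH_FOR_SIGMOID"] = "1"
os.environ["AIT_USE_FAST_MATH"] = "1"
```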
4 changes: 4 additions & 0 deletions fx2ait/fx2ait/fx2ait.py
@@ -69,6 +69,7 @@ def __init__(
save_remote_cache: Optional[bool] = False,
do_optimize_graph: bool = True,
use_fast_math: bool = True,
use_tanh_for_sigmoid: bool = False,
profile_timeout: int = 500,
optimize_for_compilation_time: bool = False,
):
@@ -89,6 +90,7 @@ def __init__(
remote_cache_file_path: AITemplate profiling cache location
save_remote_cache: whether to save the updated cache
use_fast_math: whether to use fast math in CUDA kernels
use_tanh_for_sigmoid: whether to use tanh to approximate sigmoid in CUDA kernels
profile_timeout: timeout in seconds for AIT profilers to complete
optimize_for_compilation_time: we use O1 and disable the ProfileImpl function to reduce compilation time.
"""
@@ -114,6 +116,7 @@ def __init__(
_LOGGER.info(f"Set CACHE_DIR to {self.cache_dir}")
self.use_fp16_acc = use_fp16_acc
self.use_fast_math = use_fast_math
self.use_tanh_for_sigmoid = use_tanh_for_sigmoid
self.optimize_for_compilation_time = optimize_for_compilation_time
self.hardware_target = self._create_target()
self.input_specs = input_specs
@@ -141,6 +144,7 @@ def _create_target(self):
use_fp16_acc=self.use_fp16_acc,
remote_cache_bytes=self.remote_cache_bytes,
use_fast_math=self.use_fast_math,
use_tanh_for_sigmoid=self.use_tanh_for_sigmoid,
optimize_for_compilation_time=self.optimize_for_compilation_time,
)

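The changes above follow a simple plumbing pattern: the new boolean is accepted by the constructor, stored on the instance, and forwarded as a keyword argument when the hardware target is created. Below is a hypothetical, self-contained sketch of that pattern; the class and helper names are illustrative, not the actual fx2ait API.

```python
# Hypothetical sketch of the plumbing in this diff: a constructor flag is
# stored on the instance and forwarded into the target kwargs.
from typing import Any, Dict


def create_target(**kwargs: Any) -> Dict[str, Any]:
    # Stand-in for the real target factory; it just echoes the kwargs it receives.
    return kwargs


class InterpreterSketch:
    def __init__(self, use_fast_math: bool = True, use_tanh_for_sigmoid: bool = False):
        self.use_fast_math = use_fast_math
        self.use_tanh_for_sigmoid = use_tanh_for_sigmoid
        self.hardware_target = create_target(
            use_fast_math=self.use_fast_math,
            use_tanh_for_sigmoid=self.use_tanh_for_sigmoid,
        )


print(InterpreterSketch(use_tanh_for_sigmoid=True).hardware_target)
# {'use_fast_math': True, 'use_tanh_for_sigmoid': True}
```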
2 changes: 2 additions & 0 deletions fx2ait/fx2ait/lower/lower_settings.py
@@ -85,3 +85,5 @@ class LowerSettings:
trace_ait_module: bool = True
# If True, optimize for compilation time (ie. compile w/ -O1 rather than -O3 and skip profiling codegen)
optimize_for_compilation_time: bool = False
# If True, use tanh to approximate sigmoid in CUDA kernels
use_tanh_for_sigmoid: bool = False
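A hedged usage sketch for the new field, assuming LowerSettings is importable from this path and that the fields not shown in the diff also have defaults:

```python
from fx2ait.lower.lower_settings import LowerSettings

# The new field defaults to False, so existing callers are unaffected;
# opting in is a one-line change.
settings = LowerSettings(
    optimize_for_compilation_time=False,
    use_tanh_for_sigmoid=True,
)
```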
10 changes: 5 additions & 5 deletions python/aitemplate/backend/cuda/elementwise/custom_math.cuh
@@ -161,15 +161,15 @@ __device__ bfloat16 fast_tanh(bfloat16 x) {
}

__device__ float fsigmoid_custom(const float a) {
#if defined(AIT_USE_FAST_MATH)
#if defined(AIT_USE_TANH_FOR_SIGMOID)
return (cutlass::fast_tanh(a * 0.5f) + 1.0f) * 0.5f;
#else
return 1.0f / (1.0f + expf(-a));
#endif
}

__device__ half hsigmoid_custom(const half a) {
#if defined(AIT_USE_FAST_MATH)
#if defined(AIT_USE_TANH_FOR_SIGMOID)
return __hmul(
(__hadd(fast_tanh(__hmul(a, CUDA_FP16_ONE_HALF)), CUDA_FP16_ONE)),
CUDA_FP16_ONE_HALF);
@@ -179,7 +179,7 @@ }
}

__device__ half2 h2sigmoid_custom(const half2 a) {
#if defined(AIT_USE_FAST_MATH)
#if defined(AIT_USE_TANH_FOR_SIGMOID)
const auto halfX2 = half2(CUDA_FP16_ONE_HALF, CUDA_FP16_ONE_HALF);
const auto oneX2 = half2(CUDA_FP16_ONE, CUDA_FP16_ONE);
return __hmul2((__hadd2(fast_tanh(__hmul2(a, halfX2)), oneX2)), halfX2);
@@ -192,7 +192,7 @@ __device__ half2 h2sigmoid_custom(const half2 a) {
__device__ bfloat16 hsigmoid_custom(const bfloat16 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)

#if defined(AIT_USE_FAST_MATH)
#if defined(AIT_USE_TANH_FOR_SIGMOID)
return __hmul(
(__hadd(fast_tanh(__hmul(a, CUDA_BF16_ONE_HALF)), CUDA_BF16_ONE)),
CUDA_BF16_ONE_HALF);
@@ -208,7 +208,7 @@ __device__ bfloat16 hsigmoid_custom(const bfloat16 a) {
__device__ bfloat16_2 h2sigmoid_custom(const bfloat16_2 a) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)

#if defined(AIT_USE_FAST_MATH)
#if defined(AIT_USE_TANH_FOR_SIGMOID)
const auto halfX2 = bfloat16_2(CUDA_BF16_ONE_HALF, CUDA_BF16_ONE_HALF);
const auto oneX2 = bfloat16_2(CUDA_BF16_ONE, CUDA_BF16_ONE);
return __hmul2((__hadd2(fast_tanh(__hmul2(a, halfX2)), oneX2)), halfX2);
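All of the newly guarded branches rely on the exact identity sigmoid(x) = (tanh(x / 2) + 1) / 2; the accuracy trade-off comes from cutlass::fast_tanh being an approximate tanh, not from the identity itself. A quick numerical check of the identity in Python (not part of this PR):

```python
# Verify sigmoid(x) == (tanh(x / 2) + 1) / 2 for a few sample values.
import math

for x in (-6.0, -1.0, 0.0, 0.5, 4.0):
    exact = 1.0 / (1.0 + math.exp(-x))
    via_tanh = (math.tanh(x * 0.5) + 1.0) * 0.5
    assert math.isclose(exact, via_tanh, rel_tol=1e-9), (x, exact, via_tanh)
```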
22 changes: 20 additions & 2 deletions python/aitemplate/backend/cuda/target_def.py
@@ -178,10 +178,19 @@ def _build_nvcc_compiler_options(self) -> List[str]:
options.extend(
[
"--use_fast_math",
"-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
"-DAIT_USE_FAST_MATH=1",
]
)
if (
self._kwargs.get("use_tanh_for_sigmoid", False)
or environ.use_tanh_for_sigmoid()
):
options.extend(
[
"-DAIT_USE_TANH_FOR_SIGMOID=1",
"-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
]
)
return options

def get_device_compiler_options(self) -> List[str]:
@@ -445,10 +454,19 @@ def _build_compile_options(self):
compile_options.extend(
[
"--use_fast_math",
"-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
"-DAIT_USE_FAST_MATH=1",
]
)
if (
self._kwargs.get("use_tanh_for_sigmoid", False)
or environ.use_tanh_for_sigmoid()
):
compile_options.extend(
[
"-DAIT_USE_TANH_FOR_SIGMOID=1",
"-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
]
)
compile_options_str = " ".join(compile_options)
_LOGGER.info(f"The compile options are: {compile_options_str}")
return compile_options_str
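The net effect of the two hunks above: the fast-math flags and the tanh-for-sigmoid defines are now appended by separate, independent checks, with -DCUTLASS_USE_TANH_FOR_SIGMOID=1 moved out of the fast-math block. An illustrative re-implementation of that option logic (not the actual Target classes) for reference:

```python
# Illustrative sketch of the option logic in this diff: fast math and the
# tanh-for-sigmoid defines are controlled by independent toggles.
import os
from typing import Dict, List


def build_device_compile_options(kwargs: Dict[str, bool]) -> List[str]:
    options: List[str] = []
    if kwargs.get("use_fast_math", True):
        options += ["--use_fast_math", "-DAIT_USE_FAST_MATH=1"]
    if kwargs.get("use_tanh_for_sigmoid", False) or (
        os.getenv("AIT_USE_TANH_FOR_SIGMOID", "0") == "1"
    ):
        options += ["-DAIT_USE_TANH_FOR_SIGMOID=1", "-DCUTLASS_USE_TANH_FOR_SIGMOID=1"]
    return options


print(build_device_compile_options({"use_tanh_for_sigmoid": True}))
# ['--use_fast_math', '-DAIT_USE_FAST_MATH=1',
#  '-DAIT_USE_TANH_FOR_SIGMOID=1', '-DCUTLASS_USE_TANH_FOR_SIGMOID=1']
```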
13 changes: 11 additions & 2 deletions python/aitemplate/utils/environ.py
@@ -36,16 +36,25 @@ def get_compiler_opt_level() -> str:
return compiler_opt


def use_fast_math() -> str:
def use_fast_math() -> bool:
"""
Whether the fast math option should be used for the device code generation.
Fast math implies the use of approximate math operations (say,
a division operation), trading accuracy for speed.
Default value is "1".
If the environment variable is not set, the default value is "1".
"""
return os.getenv("AIT_USE_FAST_MATH", "1") == "1"


def use_tanh_for_sigmoid() -> bool:
"""
Whether tanh should be used to approximate sigmoid for the device code generation.
This controls code generation for both AITemplate kernels and CUTLASS.
If the environment variable is not set, the default value is "0".
"""
return os.getenv("AIT_USE_TANH_FOR_SIGMOID", "0") == "1"


def enable_cuda_lto() -> bool:
"""
nvcc will use LTO flags during compilation
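A small hedged check of the new helper, assuming the module is importable as aitemplate.utils.environ after this change; the two toggles are read by separate helpers and can be flipped independently:

```python
import os

from aitemplate.utils import environ

os.environ["AIT_USE_TANH_FOR_SIGMOID"] = "1"
print(environ.use_fast_math())         # True unless AIT_USE_FAST_MATH is set to "0"
print(environ.use_tanh_for_sigmoid())  # True, because of the line above
```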