Skip to content

Commit

Permalink
add FLAGS instead of max_partition_size
Browse files Browse the repository at this point in the history
  • Loading branch information
zhink committed Dec 12, 2024
1 parent f445a7a commit 8e0ba30
Show file tree
Hide file tree
Showing 24 changed files with 13 additions and 58 deletions.
14 changes: 0 additions & 14 deletions csrc/gpu/append_attention.cu
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
const float out_linear_in_scale,
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down Expand Up @@ -209,7 +208,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
quant_max_bound,
quant_min_bound,
out_linear_in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -248,7 +246,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
quant_max_bound,
quant_min_bound,
out_linear_in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -292,7 +289,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
quant_max_bound,
quant_min_bound,
out_linear_in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -440,7 +436,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
quant_max_bound,
quant_min_bound,
out_linear_in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -479,7 +474,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
quant_max_bound,
quant_min_bound,
out_linear_in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -524,7 +518,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
quant_max_bound,
quant_min_bound,
out_linear_in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -585,7 +578,6 @@ std::vector<paddle::Tensor> AppendAttention(
const float out_linear_in_scale,
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down Expand Up @@ -650,7 +642,6 @@ std::vector<paddle::Tensor> AppendAttention(
out_linear_in_scale,
encoder_block_shape_q,
decoder_block_shape_q,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -700,7 +691,6 @@ std::vector<paddle::Tensor> AppendAttention(
out_linear_in_scale,
encoder_block_shape_q,
decoder_block_shape_q,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -751,7 +741,6 @@ std::vector<paddle::Tensor> AppendAttention(
out_linear_in_scale,
encoder_block_shape_q,
decoder_block_shape_q,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -800,7 +789,6 @@ std::vector<paddle::Tensor> AppendAttention(
out_linear_in_scale,
encoder_block_shape_q,
decoder_block_shape_q,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -905,7 +893,6 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
const float out_linear_in_scale,
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down Expand Up @@ -985,7 +972,6 @@ PD_BUILD_OP(append_attention)
"out_linear_in_scale: float",
"encoder_block_shape_q: int",
"decoder_block_shape_q: int",
"max_partition_size: int",
"encoder_max_partition_size: int",
"speculate_max_draft_token_num: int",
"causal: bool",
Expand Down
7 changes: 2 additions & 5 deletions csrc/gpu/append_attn/append_attention_c16_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -786,7 +786,6 @@ void MultiQueryAppendAttention(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool is_decoder,
Expand Down Expand Up @@ -839,7 +838,7 @@ void MultiQueryAppendAttention(
int sm_count;
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);

uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
static uint32_t chunk_size = get_max_partition_size();
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
Expand Down Expand Up @@ -1058,7 +1057,7 @@ void MultiQueryAppendAttention(
int sm_count;
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);

uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
static uint32_t chunk_size = get_max_partition_size();
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
Expand Down Expand Up @@ -1301,7 +1300,6 @@ void CascadeAppendAttentionC16Kernel(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down Expand Up @@ -1363,7 +1361,6 @@ void CascadeAppendAttentionC16Kernel(
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
is_decoder,
Expand Down
7 changes: 2 additions & 5 deletions csrc/gpu/append_attn/append_attention_c4_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -973,7 +973,6 @@ void MultiQueryAppendC4Attention(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool is_decoder,
Expand Down Expand Up @@ -1036,7 +1035,7 @@ void MultiQueryAppendC4Attention(
const float ratio = static_cast<float>(num_blocks_need) /
static_cast<float>(num_blocks_per_wave);

uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
static uint32_t chunk_size = get_max_partition_size();
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
Expand Down Expand Up @@ -1282,7 +1281,7 @@ void MultiQueryAppendC4Attention(
static_cast<float>(num_blocks_per_wave);


uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
static uint32_t chunk_size = get_max_partition_size();
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
Expand Down Expand Up @@ -1538,7 +1537,6 @@ void CascadeAppendAttentionC4Kernel(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down Expand Up @@ -1604,7 +1602,6 @@ void CascadeAppendAttentionC4Kernel(
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
is_decoder,
Expand Down
7 changes: 2 additions & 5 deletions csrc/gpu/append_attn/append_attention_c8_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -860,7 +860,6 @@ void MultiQueryAppendC8Attention(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool is_decoder,
Expand Down Expand Up @@ -914,7 +913,7 @@ void MultiQueryAppendC8Attention(
const int dev_id = 0;
int sm_count;
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
static uint32_t chunk_size = get_max_partition_size();
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
Expand Down Expand Up @@ -1136,7 +1135,7 @@ void MultiQueryAppendC8Attention(
const int dev_id = 0;
int sm_count;
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
static uint32_t chunk_size = get_max_partition_size();
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
Expand Down Expand Up @@ -1377,7 +1376,6 @@ void CascadeAppendAttentionC8Kernel(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down Expand Up @@ -1441,7 +1439,6 @@ void CascadeAppendAttentionC8Kernel(
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
is_decoder,
Expand Down
14 changes: 7 additions & 7 deletions csrc/gpu/append_attn/append_attention_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ void CascadeAppendAttentionC16Kernel(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down Expand Up @@ -97,7 +96,6 @@ void CascadeAppendAttentionC8Kernel(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down Expand Up @@ -142,7 +140,6 @@ void CascadeAppendAttentionC4Kernel(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down Expand Up @@ -188,7 +185,6 @@ void CascadeAppendAttentionKernel(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down Expand Up @@ -223,7 +219,6 @@ void CascadeAppendAttentionKernel(
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -258,7 +253,6 @@ void CascadeAppendAttentionKernel(
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand Down Expand Up @@ -293,7 +287,6 @@ void CascadeAppendAttentionKernel(
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
Expand All @@ -307,3 +300,10 @@ void CascadeAppendAttentionKernel(
"cache_int4_zp]");
}
}

// Returns the partition (chunk) size used when splitting decoder attention
// work, read from the environment flag
// `FLAGS_cascade_attention_max_partition_size`.
//
// The flag is parsed exactly once and the result is cached for the lifetime
// of the process (thread-safe via static-local initialization); later changes
// to the environment variable have no effect.
//
// If the flag is unset, non-numeric, has trailing garbage ("128abc"),
// overflows uint32_t, or is 0 (a zero chunk size is unusable as a divisor
// of the sequence length downstream), the default of 128 is returned.
// This replaces the previous std::stoul-based parse, which threw an
// uncaught std::invalid_argument / std::out_of_range on malformed input
// and silently narrowed unsigned long to uint32_t.
inline uint32_t get_max_partition_size() {
  static const uint32_t cached_size = []() -> uint32_t {
    constexpr uint32_t kDefaultPartitionSize = 128;
    const char* env =
        std::getenv("FLAGS_cascade_attention_max_partition_size");
    if (env == nullptr) {
      return kDefaultPartitionSize;
    }
    uint32_t value = 0;
    const char* end = env + std::strlen(env);
    auto [ptr, ec] = std::from_chars(env, end, value);
    // Reject partial parses, out-of-range values, and an explicit 0.
    if (ec != std::errc{} || ptr != end || value == 0) {
      return kDefaultPartitionSize;
    }
    return value;
  }();
  return cached_size;
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::bfloat16
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::float8_e
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, int8_t>(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float16>(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float8_e4
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, int8_t>(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::bfloat16>
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::float8_e4
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, int8_t>(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float16>(
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float8_e4m
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
Expand Down
Loading

0 comments on commit 8e0ba30

Please sign in to comment.