From e91c2d3d634b12769c30aa419ddf931c20b7ca9f Mon Sep 17 00:00:00 2001 From: AIbin <37361953+chang-wenbin@users.noreply.github.com> Date: Fri, 20 Dec 2024 17:02:39 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90Qwen2-VL=20Inference=E3=80=91add=20qwe?= =?UTF-8?q?n2-vl=20high=20performance=20inference=20(#9575)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * qwen2-vl --- .../transformers/qwen2/modeling.py | 157 +++++++++--------- paddlenlp/transformers/auto/configuration.py | 3 + 2 files changed, 86 insertions(+), 74 deletions(-) diff --git a/paddlenlp/experimental/transformers/qwen2/modeling.py b/paddlenlp/experimental/transformers/qwen2/modeling.py index 61405f6c4e28..6098079d9084 100644 --- a/paddlenlp/experimental/transformers/qwen2/modeling.py +++ b/paddlenlp/experimental/transformers/qwen2/modeling.py @@ -64,7 +64,11 @@ from paddlenlp.utils.download import resolve_file_path from paddlenlp.utils.log import logger -__all__ = ["Qwen2ForCausalLMInferenceModel", "Qwen2ForCausalLMBlockInferenceModel"] +__all__ = [ + "Qwen2ForCausalLMInferenceModel", + "Qwen2ForCausalLMBlockInferenceModel", + "Qwen2VLForConditionalGenerationBlockInferenceModel", +] class FusedQwen2RMSNorm(nn.Layer): @@ -551,23 +555,28 @@ def set_state_dict(self, state_dict): self.transformer_block.init_weight() split_fn = split_param_func() self.embed_tokens.weight.set_value( - paddle.to_tensor(state_dict["qwen2.embed_tokens.weight"]).cast(self.embed_tokens.weight.dtype) + paddle.to_tensor(state_dict[f"{self.base_model_prefix}.embed_tokens.weight"]).cast( + self.embed_tokens.weight.dtype + ) + ) + self.norm.weight.set_value( + paddle.to_tensor(state_dict[f"{self.base_model_prefix}.norm.weight"]).cast(self.norm.weight.dtype) ) - self.norm.weight.set_value(paddle.to_tensor(state_dict["qwen2.norm.weight"]).cast(self.norm.weight.dtype)) for idx in range(self.num_layers): + model_prefix = self.base_model_prefix + f".layers.{idx}" logger.info(f"set state for layer 
{idx}") - ln_scale = paddle.to_tensor(state_dict["qwen2.layers.{}.input_layernorm.weight".format(idx)]).cast( + ln_scale = paddle.to_tensor(state_dict[f"{model_prefix}.input_layernorm.weight"]).cast( self.transformer_block.ln_scales[idx].dtype ) self.transformer_block.ln_scales[idx].set_value(ln_scale) - if "qwen2.layers.{}.self_attn.qkv_proj.weight".format(idx) in state_dict.keys(): + if f"{model_prefix}.self_attn.qkv_proj.weight" in state_dict.keys(): concated_qkv_weight = paddle.to_tensor( np.concatenate( split_fn( - state_dict["qwen2.layers.{}.self_attn.qkv_proj.weight".format(idx)], + state_dict[f"{model_prefix}.self_attn.qkv_proj.weight"], is_qkv=True, num_heads=self.num_attention_heads // self.config.tensor_parallel_degree, num_key_value_heads=self.num_key_value_heads // self.config.tensor_parallel_degree, @@ -578,13 +587,13 @@ def set_state_dict(self, state_dict): else: unfused_state_dict = {} unfused_state_dict["self_attn.q_proj.weight"] = paddle.to_tensor( - state_dict["qwen2.layers.{}.self_attn.q_proj.weight".format(idx)] + state_dict[f"{model_prefix}.self_attn.q_proj.weight"] ) unfused_state_dict["self_attn.k_proj.weight"] = paddle.to_tensor( - state_dict["qwen2.layers.{}.self_attn.k_proj.weight".format(idx)] + state_dict[f"{model_prefix}.self_attn.k_proj.weight"] ) unfused_state_dict["self_attn.v_proj.weight"] = paddle.to_tensor( - state_dict["qwen2.layers.{}.self_attn.v_proj.weight".format(idx)] + state_dict[f"{model_prefix}.self_attn.v_proj.weight"] ) if "fp8" in self.quant_type: q_wgt_scale = self.transformer_block.weight_scales["q_weight_scale"][idx] @@ -658,30 +667,17 @@ def set_state_dict(self, state_dict): else: self.transformer_block.qkv_weights[idx].set_value(qkv_weight) - unfused_state_dict["qwen2.self_attn.q_proj.bias"] = state_dict[ - "qwen2.layers.{}.self_attn.q_proj.bias".format(idx) - ] - unfused_state_dict["qwen2.self_attn.k_proj.bias"] = state_dict[ - "qwen2.layers.{}.self_attn.k_proj.bias".format(idx) - ] - 
unfused_state_dict["qwen2.self_attn.v_proj.bias"] = state_dict[ - "qwen2.layers.{}.self_attn.v_proj.bias".format(idx) - ] + q_bias = state_dict[f"{model_prefix}.self_attn.q_proj.bias"] + k_bias = state_dict[f"{model_prefix}.self_attn.k_proj.bias"] + v_bias = state_dict[f"{model_prefix}.self_attn.v_proj.bias"] - concated_qkv_biases = np.concatenate( - [ - unfused_state_dict["qwen2.self_attn.q_proj.bias"], - unfused_state_dict["qwen2.self_attn.k_proj.bias"], - unfused_state_dict["qwen2.self_attn.v_proj.bias"], - ], - axis=-1, - ) + concated_qkv_biases = np.concatenate([q_bias, k_bias, v_bias], axis=-1) qkv_bias = paddle.to_tensor(concated_qkv_biases) self.transformer_block.qkv_biases[idx].set_value( qkv_bias.cast(self.transformer_block.qkv_biases[idx].dtype) ) - linear_weight = paddle.to_tensor(state_dict["qwen2.layers.{}.self_attn.o_proj.weight".format(idx)]).cast( + linear_weight = paddle.to_tensor(state_dict[f"{model_prefix}.self_attn.o_proj.weight"]).cast( paddle.get_default_dtype() ) if self.use_weight_only: @@ -691,9 +687,7 @@ def set_state_dict(self, state_dict): elif "fp8" in self.quant_type: self.transformer_block.linear_weights[idx].copy_( paddle.cast( - paddle.to_tensor(state_dict["qwen2.layers.{}.self_attn.o_proj.weight".format(idx)]).transpose( - (1, 0) - ), + paddle.to_tensor(state_dict[f"{model_prefix}.self_attn.o_proj.weight"]).transpose((1, 0)), "float8_e4m3fn", ), False, @@ -707,16 +701,14 @@ def set_state_dict(self, state_dict): if paddle.is_compiled_with_rocm(): self.transformer_block.linear_weights[idx].set_value( paddle.cast( - paddle.to_tensor(state_dict["qwen2.layers.{}.self_attn.o_proj.weight".format(idx)]), + paddle.to_tensor(state_dict[f"{model_prefix}.self_attn.o_proj.weight"]), w_dtype, ) ) else: self.transformer_block.linear_weights[idx].set_value( paddle.cast( - paddle.to_tensor( - state_dict["qwen2.layers.{}.self_attn.o_proj.weight".format(idx)] - ).transpose((1, 0)), + 
paddle.to_tensor(state_dict[f"{model_prefix}.self_attn.o_proj.weight"]).transpose((1, 0)), w_dtype, ) ) @@ -726,22 +718,20 @@ def set_state_dict(self, state_dict): ) ffn_ln_scale = paddle.to_tensor( - state_dict["qwen2.layers.{}.post_attention_layernorm.weight".format(idx)], + state_dict[f"{model_prefix}.post_attention_layernorm.weight"], ) self.transformer_block.ffn_ln_scales[idx].set_value( ffn_ln_scale.cast(self.transformer_block.ffn_ln_scales[idx].dtype) ) - if "qwen2.layers.{}.mlp.gate_up_fused_proj.weight".format(idx) in state_dict.keys(): + if f"{model_prefix}.mlp.gate_up_fused_proj.weight" in state_dict.keys(): concated_ffn1_weight = np.concatenate( - split_fn(state_dict["qwen2.layers.{}.mlp.gate_up_fused_proj.weight".format(idx)]), axis=-1 + split_fn(state_dict[f"{model_prefix}.mlp.gate_up_fused_proj.weight"]), axis=-1 ) else: - unfused_state_dict["mlp.gate_proj.weight"] = state_dict[ - "qwen2.layers.{}.mlp.gate_proj.weight".format(idx) - ] - unfused_state_dict["mlp.up_proj.weight"] = state_dict["qwen2.layers.{}.mlp.up_proj.weight".format(idx)] + unfused_state_dict["mlp.gate_proj.weight"] = state_dict[f"{model_prefix}.mlp.gate_proj.weight"] + unfused_state_dict["mlp.up_proj.weight"] = state_dict[f"{model_prefix}.mlp.up_proj.weight"] concated_ffn1_weight = np.concatenate( [unfused_state_dict["mlp.gate_proj.weight"], unfused_state_dict["mlp.up_proj.weight"]], axis=-1 ) @@ -781,14 +771,14 @@ def set_state_dict(self, state_dict): ffn1_weight.cast(self.transformer_block.ffn1_weights[idx].dtype) ) - ffn2_weight = paddle.to_tensor(state_dict["qwen2.layers.{}.mlp.down_proj.weight".format(idx)]) + ffn2_weight = paddle.to_tensor(state_dict[f"{model_prefix}.mlp.down_proj.weight"]) if self.use_weight_only: ffn2_quanted_weight, ffn2_weight_scale = weight_quantize(ffn2_weight, algo=self.quant_algo) self.transformer_block.ffn2_weights[idx].set_value(ffn2_quanted_weight) self.transformer_block.ffn2_weights_scale[idx].set_value(ffn2_weight_scale) elif "fp8" in 
self.quant_type: self.transformer_block.ffn2_weights[idx].copy_( - paddle.to_tensor(state_dict["qwen2.layers.{}.mlp.down_proj.weight".format(idx)]) + paddle.to_tensor(state_dict[f"{model_prefix}.mlp.down_proj.weight"]) .transpose([1, 0]) .cast("float8_e4m3fn"), False, @@ -811,57 +801,57 @@ def set_state_dict(self, state_dict): if "fp8" not in self.quant_type and "a8w8" in self.quant_type: if self.shift_smooth_all_linears: if self.use_fake_parameter: - if "qwen2.layers.{}.self_attn.o_proj.shift_bias".format(idx) not in state_dict: - state_dict["qwen2.layers.{}.self_attn.o_proj.shift_bias".format(idx)] = paddle.zeros( + if f"{model_prefix}.self_attn.o_proj.shift_bias" not in state_dict: + state_dict[f"{model_prefix}.self_attn.o_proj.shift_bias"] = paddle.zeros( shape=[ (self.num_attention_heads // self.config.tensor_parallel_degree) * (self.hidden_size // self.num_attention_heads) ], dtype=paddle.get_default_dtype(), ) - state_dict["qwen2.layers.{}.self_attn.o_proj.smooth_weight".format(idx)] = paddle.ones( + state_dict[f"{model_prefix}.self_attn.o_proj.smooth_weight"] = paddle.ones( shape=[ (self.num_attention_heads // self.config.tensor_parallel_degree) * (self.hidden_size // self.num_attention_heads) ], dtype=paddle.get_default_dtype(), ) - state_dict["qwen2.layers.{}.mlp.down_proj.shift_bias".format(idx)] = paddle.zeros( + state_dict[f"{model_prefix}.mlp.down_proj.shift_bias"] = paddle.zeros( shape=[self.intermediate_size // self.config.tensor_parallel_degree], dtype=paddle.get_default_dtype(), ) - state_dict["qwen2.layers.{}.mlp.down_proj.smooth_weight".format(idx)] = paddle.ones( + state_dict[f"{model_prefix}.mlp.down_proj.smooth_weight"] = paddle.ones( shape=[self.intermediate_size // self.config.tensor_parallel_degree], dtype=paddle.get_default_dtype(), ) self.transformer_block.linear_shifts[idx].set_value( - paddle.to_tensor(state_dict["qwen2.layers.{}.self_attn.o_proj.shift_bias".format(idx)]).astype( + 
paddle.to_tensor(state_dict[f"{model_prefix}.self_attn.o_proj.shift_bias"]).astype( paddle.get_default_dtype() ) ) self.transformer_block.linear_smooths[idx].set_value( - paddle.to_tensor( - state_dict["qwen2.layers.{}.self_attn.o_proj.smooth_weight".format(idx)] - ).astype(paddle.get_default_dtype()) + paddle.to_tensor(state_dict[f"{model_prefix}.self_attn.o_proj.smooth_weight"]).astype( + paddle.get_default_dtype() + ) ) self.transformer_block.ffn2_shifts[idx].set_value( - paddle.to_tensor(state_dict["qwen2.layers.{}.mlp.down_proj.shift_bias".format(idx)]).astype( + paddle.to_tensor(state_dict[f"{model_prefix}.mlp.down_proj.shift_bias"]).astype( paddle.get_default_dtype() ) ) self.transformer_block.ffn2_smooths[idx].set_value( - paddle.to_tensor(state_dict["qwen2.layers.{}.mlp.down_proj.smooth_weight".format(idx)]).astype( + paddle.to_tensor(state_dict[f"{model_prefix}.mlp.down_proj.smooth_weight"]).astype( paddle.get_default_dtype() ) ) if self.shift: if self.use_fake_parameter: - if "qwen2.layers.{}.input_layernorm.bias".format(idx) not in state_dict: - state_dict["qwen2.layers.{}.input_layernorm.bias".format(idx)] = paddle.zeros( + if f"{model_prefix}.input_layernorm.bias" not in state_dict: + state_dict[f"{model_prefix}.input_layernorm.bias"] = paddle.zeros( shape=[self.hidden_size], dtype=paddle.get_default_dtype() ) - state_dict["qwen2.layers.{}.post_attention_layernorm.bias".format(idx)] = paddle.zeros( + state_dict[f"{model_prefix}.post_attention_layernorm.bias"] = paddle.zeros( [self.hidden_size], dtype=paddle.get_default_dtype() ) unfused_state_dict["self_attn.q_proj.bias"] = paddle.zeros( @@ -884,26 +874,22 @@ def set_state_dict(self, state_dict): ) else: unfused_state_dict["self_attn.q_proj.bias"] = state_dict[ - "qwen2.layers.{}.self_attn.q_proj.bias".format(idx) + f"{model_prefix}.self_attn.q_proj.bias" ] unfused_state_dict["self_attn.k_proj.bias"] = state_dict[ - "qwen2.layers.{}.self_attn.k_proj.bias".format(idx) + 
f"{model_prefix}.self_attn.k_proj.bias" ] unfused_state_dict["self_attn.v_proj.bias"] = state_dict[ - "qwen2.layers.{}.self_attn.v_proj.bias".format(idx) - ] - unfused_state_dict["mlp.gate_proj.bias"] = state_dict[ - "qwen2.layers.{}.mlp.gate_proj.bias".format(idx) - ] - unfused_state_dict["mlp.up_proj.bias"] = state_dict[ - "qwen2.layers.{}.mlp.up_proj.bias".format(idx) + f"{model_prefix}.self_attn.v_proj.bias" ] + unfused_state_dict["mlp.gate_proj.bias"] = state_dict[f"{model_prefix}.mlp.gate_proj.bias"] + unfused_state_dict["mlp.up_proj.bias"] = state_dict[f"{model_prefix}.mlp.up_proj.bias"] self.transformer_block.ln_biases[idx].set_value( - paddle.to_tensor(state_dict["qwen2.layers.{}.input_layernorm.bias".format(idx)]) + paddle.to_tensor(state_dict[f"{model_prefix}.input_layernorm.bias"]) ) self.transformer_block.ffn_ln_biases[idx].set_value( - paddle.to_tensor(state_dict["qwen2.layers.{}.post_attention_layernorm.bias".format(idx)]) + paddle.to_tensor(state_dict[f"{model_prefix}.post_attention_layernorm.bias"]) ) concated_qkv_biases = np.concatenate( [ @@ -922,18 +908,18 @@ def set_state_dict(self, state_dict): if self.shift_smooth_all_linears: if self.use_fake_parameter: - if "qwen2.layers.{}.self_attn.o_proj.bias".format(idx) not in state_dict: - state_dict["qwen2.layers.{}.self_attn.o_proj.bias".format(idx)] = paddle.zeros( + if f"{model_prefix}.self_attn.o_proj.bias" not in state_dict: + state_dict[f"{model_prefix}.self_attn.o_proj.bias"] = paddle.zeros( [self.hidden_size], dtype=paddle.get_default_dtype() ) - state_dict["qwen2.layers.{}.mlp.down_proj.layer.bias".format(idx)] = paddle.zeros( + state_dict[f"{model_prefix}.mlp.down_proj.layer.bias"] = paddle.zeros( [self.hidden_size], dtype=paddle.get_default_dtype() ) self.transformer_block.linear_biases[idx].set_value( - paddle.to_tensor(state_dict["qwen2.layers.{}.self_attn.o_proj.bias".format(idx)]) + paddle.to_tensor(state_dict[f"{model_prefix}.self_attn.o_proj.bias"]) ) 
self.transformer_block.ffn2_biases[idx].set_value( - paddle.to_tensor(state_dict["qwen2.layers.{}.mlp.down_proj.layer.bias".format(idx)]) + paddle.to_tensor(state_dict[f"{model_prefix}.mlp.down_proj.layer.bias"]) ) def remove_padding(self, input_ids, seq_lens_this_time): @@ -1286,7 +1272,14 @@ def forward( kwargs["padding_offsets"] = padding_offset kwargs["max_input_length"] = self.max_seq_len - inputs_embeds = self.embed_tokens(ids_remove_padding) + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(ids_remove_padding) + else: + assert len(inputs_embeds.shape) == 3 + # This is the case in the image-to-text model such as qwen2-vl, + # In the prefill phase, the language model is first fed with inputs_embeds instead of input_ids + # but in decoder phase, the language model is fed with input_ids just like normal text-to-text model. + inputs_embeds = inputs_embeds.reshape([-1, inputs_embeds.shape[2]]) with dy2st_nocheck_guard_context(): hidden_states, _ = self.transformer_block( @@ -1425,6 +1418,7 @@ def get_cache_kvs_shape( def prepare_inputs_for_generation(self, **kwargs): # only last token for inputs_ids if cache is defined in kwargs input_ids = kwargs["input_ids"] + inputs_embeds = kwargs.get("inputs_embeds", None) src_mask = kwargs.get("src_mask", None) block_tables = kwargs.get("block_tables", None) @@ -1446,6 +1440,7 @@ def prepare_inputs_for_generation(self, **kwargs): model_inputs = { "input_ids": input_ids, + "inputs_embeds": inputs_embeds, "src_mask": src_mask, "rope_emb": rope_emb, "pre_caches": pre_caches, @@ -1466,6 +1461,7 @@ def prepare_inputs_for_generation(self, **kwargs): def forward( self, input_ids, + inputs_embeds=None, src_mask=None, pre_caches=None, caches=None, @@ -1483,6 +1479,7 @@ def forward( ): outputs = self.qwen2( input_ids, + inputs_embeds=inputs_embeds, src_mask=src_mask, caches=caches, rope_emb=rope_emb, @@ -1514,3 +1511,15 @@ def set_state_dict(self, state_dict): 
paddle.to_tensor(state_dict["lm_head.weight"]).cast(self.lm_head.weight.dtype) ) self.qwen2.set_state_dict({k: state_dict[k] for k in state_dict.keys()}) + + +class Qwen2VLForConditionalGenerationBlockInferenceModel(Qwen2ForCausalLMBlockInferenceModel): + """ + NOTE: (changwenbin) This class inherits from Qwen2ForCausalLMBlockInferenceModel. + Used only for QWen2-VL's second part. + """ + + # NOTE: (changwenbin) This function corresponds to QWen2-VL's second part, only used for QWen2-VL. + def __init__(self, config): + super().__init__(config) + self.qwen2.base_model_prefix = "model" diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py index ff89b81d5cc2..f2058a5ec389 100644 --- a/paddlenlp/transformers/auto/configuration.py +++ b/paddlenlp/transformers/auto/configuration.py @@ -231,6 +231,9 @@ def __init__(self, mapping): self._modules = {} def __getitem__(self, key): + # NOTE: (changwenbin) This is to enable the qwen2_vl language model to use qwen2 inference optimization + if key == "qwen2_vl": + key = "qwen2" if key in self._extra_content: return self._extra_content[key] if key not in self._mapping: